我目前正在构建用于声音分类的 CNN。问题相对简单:我需要我的模型来检测音频记录中是否有人类语音。我制作了一个训练/测试集,其中包含 3 秒的记录,其中有人类语音(语音)或没有(no_speech)。从这 3 秒的片段中,我得到了一个尺寸为 128 x 128 的梅尔谱图,用于为模型提供数据。
由于这是一个简单的二元问题,我认为 CNN 可以轻松检测到人类语音,但我可能太自大了。然而,似乎在 1 或 2 个 epoch 之后,模型不再学习,即损失不会减少,就好像权重没有更新一样,正确预测的数量大致保持不变。我尝试使用超参数,但问题仍然存在。我尝试了 0.1、0.01 的学习率……直到 1e-7。我也尝试使用更复杂的模型,但同样发生。
然后我认为这可能是由于脚本本身造成的,但我找不到任何错误:计算损失,然后使用 backward()
计算梯度,并且应该更新权重。我很高兴你能快速浏览一下脚本,让我知道可能出了什么问题!如果您对为什么会出现此问题有其他想法,我也很高兴收到一些有关如何最好地训练我的 CNN 的建议。
我根据 Stevens 的“Deep learning in PyTorch”中的 LunaTrainingApp 编写脚本,因为我发现脚本很优雅。当然,我修改了它以匹配我的问题,我添加了一种计算精度和召回率的方法以及一些其他自定义指标,例如正确预测的百分比。
脚本如下:
import torch
import torch.nn as nn
import argparse
import numpy as np
import logging
logging.basicConfig(level = logging.INFO)
log = logging.getLogger(__name__)
from torch.optim import SGD
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from sklearn.metrics import confusion_matrix
from dataset_loader.audiodataset import AudioDataset
from models.vadnet import VADNet
from utils.earlystopping import EarlyStopping
class VADTrainingApp:
def __init__(self, sys_argv=None):
parser = argparse.ArgumentParser()
parser.add_argument("--train_path",
help='Path to the training set',
required=True,
type=str,
)
parser.add_argument("--test_path",
help='Path to the testing set',
required=True,
type=str,
)
parser.add_argument("--save_path",
help='Path to saving the model',
required=True,
type=str,
)
parser.add_argument("--save_es",
help='Save the checkpoints of early stopping call',
default="checkpoint.pt",
type=str,
)
parser.add_argument('--num-workers',
help='Number of worker processes for background data loading',
default=8,
type=int,
)
parser.add_argument("--batch_size",
help='Batch size to use for training',
default=32,
type=int,)
parser.add_argument('--epochs',
help='Number of epochs to train for',
default=50,
type=int,
)
parser.add_argument('--lr',
help='Learning rate for th stochastic gradient descent',
default=0.001,
type=float,
)
self.cli_args = parser.parse_args(sys_argv)
# related to the hardware
self.use_cuda = torch.cuda.is_available()
self.device = torch.device("cuda" if self.use_cuda else "cpu")
# directly related to the neural network
self.model = self.initModel()
self.optimizer = self.initOptimizer()
# For early stopping
self.patience = 7
# For metrics
self.METRICS_LABELS_NDX = 0
self.METRICS_PREDS_NDX = 1
self.METRICS_LOSS_NDX = 2
self.METRICS_SIZE = 3
def initModel(self):
"""Initialize the model, if GPU available computation done there"""
model = VADNet()
model = model.double()
if self.use_cuda:
log.info("Using CUDA; {} devices".format(torch.cuda.device_count()))
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
model = model.to(self.device)
return model
def initOptimizer(self):
return SGD(self.model.parameters(), lr=self.cli_args.lr)#, momentum=0.8, weight_decay=0.01)
def adjust_learning_rate(self):
"""Sets the learning rate to the initial LR decayed by a factor of 10 every 20 epochs"""
self.cli_args.lr = self.cli_args.lr * (0.1 ** (self.cli_args.epochs // 20))
for param_group in self.optimizer.param_groups:
param_group['lr'] = self.cli_args.lr
def initTrainDL(self):
trainingset = AudioDataset(self.cli_args.train_path,
n_fft=1024,
hop_length=376,
n_mels=128)
batch_size = self.cli_args.batch_size
if self.use_cuda:
batch_size *= torch.cuda.device_count()
trainLoader = DataLoader(trainingset,
batch_size = batch_size,
shuffle=True,
num_workers=self.cli_args.num_workers,
pin_memory=self.use_cuda)
return trainLoader
def initTestDL(self):
testset = AudioDataset(self.cli_args.test_path,
n_fft=1024,
hop_length=376,
n_mels=128)
batch_size = self.cli_args.batch_size
if self.use_cuda:
batch_size *= torch.cuda.device_count()
testLoader = DataLoader(testset,
batch_size = batch_size,
shuffle=True,
num_workers=self.cli_args.num_workers,
pin_memory=self.use_cuda)
return testLoader
def main(self):
log.info("Start training, {}".format(self.cli_args))
train_dl = self.initTrainDL()
test_dl = self.initTestDL()
trn_writer = SummaryWriter(log_dir='runs' + '-trn')
val_writer = SummaryWriter(log_dir='runs' + '-val')
early_stopping = EarlyStopping(patience=self.patience, path=self.cli_args.save_es, verbose=True)
for epoch_ndx in range(1, self.cli_args.epochs + 1):
log.info("Epoch {} / {}".format(epoch_ndx, self.cli_args.epochs))
# Adjust the new learning rate
self.adjust_learning_rate()
# Train the model's parameters
metrics_t = self.do_training(train_dl)
self.logMetrics(metrics_t, trn_writer, epoch_ndx)
# Test the model
metrics_v = self.do_val(test_dl, val_writer)
self.logMetrics(metrics_v, val_writer, epoch_ndx, train=False)
# Add the mean loss of the val for the epoch
early_stopping(metrics_v[self.METRICS_LOSS_NDX].mean(), self.model)
if early_stopping.early_stop:
print("Early stopping")
break
# Save the model once all epochs have been completed
torch.save(self.model.state_dict(), self.cli_args.save_path)
def do_training(self, train_dl):
"""Training loop"""
self.model.train()
# Initiate a 3 dimension tensor to store loss, labels and prediction
trn_metrics = torch.zeros(self.METRICS_SIZE, len(train_dl.dataset), device=self.device)
for batch_ndx, batch_tup in enumerate(train_dl):
if batch_ndx%100==0:
log.info("TRAINING --> Batch {} / {}".format(batch_ndx, len(train_dl)))
self.optimizer.zero_grad()
loss = self.ComputeBatchLoss(batch_ndx,
batch_tup,
self.cli_args.batch_size,
trn_metrics)
loss.backward()
self.optimizer.step()
return trn_metrics.to('cpu')
def do_val(self, test_dl, early_stop):
"""Validation loop"""
with torch.no_grad():
self.model.eval()
val_metrics = torch.zeros(self.METRICS_SIZE, len(test_dl.dataset), device=self.device)
for batch_ndx, batch_tup in enumerate(test_dl):
if batch_ndx%100==0:
log.info("VAL --> Batch {} / {}".format(batch_ndx, len(test_dl)))
loss = self.ComputeBatchLoss(batch_ndx,
batch_tup,
self.cli_args.batch_size,
val_metrics)
return val_metrics.to('cpu')
def ComputeBatchLoss(self, batch_ndx, batch_tup, batch_size, metrics_mat):
"""
Return a tensor the loss of the batch
"""
imgs, labels = batch_tup
imgs = imgs.to(device=self.device, non_blocking=True)
labels = labels.to(device=self.device, non_blocking=True)
outputs = self.model(imgs)
_, predicted = torch.max(outputs, dim=1)
loss_func = nn.CrossEntropyLoss(reduction="none")
loss = loss_func(outputs, labels)
start_ndx = batch_ndx * self.cli_args.batch_size
end_ndx = start_ndx + labels.size(0)
metrics_mat[self.METRICS_LABELS_NDX, start_ndx:end_ndx] = labels.detach()
metrics_mat[self.METRICS_PREDS_NDX, start_ndx:end_ndx] = predicted.detach()
metrics_mat[self.METRICS_LOSS_NDX, start_ndx:end_ndx] = loss.detach()
return loss.mean()
def logMetrics(self, metrics_mat, writer, epoch_ndx, train=True):
"""
Function to compute custom metrics: accurracy and recall for both classes
and % of correct predictions. Log the metrics in a tensorboard writer
"""
# Confusion matrix to compute precision / recall for each class
tn, fp, fn, tp = torch.tensor(confusion_matrix(metrics_mat[self.METRICS_LABELS_NDX],
metrics_mat[self.METRICS_PREDS_NDX],
labels=[0,1]).ravel())
precision_no_speech = tp / (tp + fp)
recall_no_speech = tp / (tp + fn)
# class speech is labelled 0, so true positive = true negative for speech
precision_speech = tn / (tn + fn)
recall_speech = tn / (fp + tn)
# % of correct predictions - optional metrics that are nice
no_speech_count = (metrics_mat[self.METRICS_LABELS_NDX] == 0).sum()
speech_count = (metrics_mat[self.METRICS_LABELS_NDX] == 1).sum()
no_speech_correct = ((metrics_mat[self.METRICS_PREDS_NDX] == 0) & (metrics_mat[self.METRICS_LABELS_NDX] == 0)).sum()
speech_correct = ((metrics_mat[self.METRICS_PREDS_NDX] == 1) & (metrics_mat[self.METRICS_LABELS_NDX] == 1)).sum()
correct_all = (speech_correct + no_speech_correct) / float(speech_count + no_speech_count) * 100
correct_speech = speech_correct / float(speech_count) * 100
correct_no_speech = no_speech_correct / float(no_speech_count) * 100
loss = metrics_mat[self.METRICS_LOSS_NDX].mean()
writer.add_scalar("loss", loss, epoch_ndx)
writer.add_scalar("precision/no_speech", precision_no_speech, epoch_ndx)
writer.add_scalar("recall/no_speech", recall_no_speech, epoch_ndx)
writer.add_scalar("precision/speech", precision_speech, epoch_ndx)
writer.add_scalar("recall/speech", recall_speech, epoch_ndx)
writer.add_scalar("correct/all", correct_all, epoch_ndx)
writer.add_scalar("correct/speech", correct_speech, epoch_ndx)
writer.add_scalar("correct/no_speech", correct_no_speech, epoch_ndx)
if train:
log.info("[TRAINING] loss: {}, correct/all: {}% , correct/speech: {}%, correct/no_speech: {}%".format(loss,
correct_all,
correct_speech,
correct_no_speech))
else:
log.info("[VAL] loss: {}, correct/all: {}% , correct/speech: {}%, correct/no_speech: {}%".format(loss,
correct_all,
correct_speech,
correct_no_speech))
if __name__ == "__main__":
VADTrainingApp().main()
关于模型,我使用了一个简单的 CNN:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class VADNet(nn.Module):
def __init__(self, in_channels=1, conv_channels=8):
super().__init__()
self.tail_batchnorm = nn.BatchNorm2d(1)
self.block1 = ConvBlock(in_channels, conv_channels)
self.block2 = ConvBlock(conv_channels, conv_channels * 2)
self.block3 = ConvBlock(conv_channels * 2, conv_channels * 4)
self.block4 = ConvBlock(conv_channels * 4, conv_channels * 8)
self.head_linear = nn.Linear(8 * 8 * conv_channels * 8, 2)
self._init_weights()
def _init_weights(self):
for m in self.modules():
if type(m) in {
nn.Linear,
nn.Conv3d,
nn.Conv2d,
nn.ConvTranspose2d,
nn.ConvTranspose3d,
}:
nn.init.kaiming_normal_(
m.weight.data, a=0, mode='fan_out', nonlinearity='relu',
)
if m.bias is not None:
fan_in, fan_out = \
nn.init._calculate_fan_in_and_fan_out(m.weight.data)
bound = 1 / math.sqrt(fan_out)
nn.init.normal_(m.bias, -bound, bound)
def forward(self, input_batch):
bn_output = self.tail_batchnorm(input_batch)
block_out = self.block1(bn_output)
block_out = self.block2(block_out)
block_out = self.block3(block_out)
block_out = self.block4(block_out)
conv_flat = block_out.view(block_out.size(0),-1)
linear_output = self.head_linear(conv_flat)
return linear_output
class ConvBlock(nn.Module):
def __init__(self, in_channels, conv_channels):
super().__init__()
self.conv1 = nn.Conv2d(
in_channels, conv_channels, kernel_size=3, padding=1, bias=True,
)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(
conv_channels, conv_channels, kernel_size=3, padding=1, bias=True,
)
self.relu2 = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(2, 2)
def forward(self, input_batch):
block_out = self.conv1(input_batch)
block_out = self.relu1(block_out)
block_out = self.conv2(block_out)
block_out = self.relu2(block_out)
return self.maxpool(block_out)
预先感谢您的帮助!
答案 0 :(得分:0)
再读一遍,让它沉没。
你现在明白是什么问题了吗?
卷积层学习静态/固定局部模式,并尝试在输入中任何地方匹配它。这对于您想要equivariant进行翻译并且所有像素具有相同“含义”的图像来说非常酷且方便。
但是,在频谱图中,不同的位置有不同的含义——频谱图顶部的像素表示高频,而较低的像素表示低频。 因此,如果您将某些局部模式与频谱图中的局部区域进行了匹配,如果它与频谱图的上部或下部匹配,则可能意味着完全不同的事情。 您需要一种不同类型的模型来处理频谱图。也许将频谱图转换为具有 128 个通道(频率)的一维信号并对其应用一维卷积?