I am currently trying to run SEGAN for speech enhancement, but I can't seem to get the network to start training because it fails with the following error:
RuntimeError: CUDA out of memory. Tried to allocate 30.00 MiB (GPU 0; 3.00 GiB total capacity; 2.00 GiB already allocated; 5.91 MiB free; 2.03 GiB reserved in total by PyTorch)
I have already tried adding torch.cuda.empty_cache(), but that does not seem to solve the problem.
This is the script I am currently running:
import argparse
import os
import torch
import torch.nn as nn
from scipy.io import wavfile
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from tqdm import tqdm
from data_preprocess import sample_rate
from model import Generator, Discriminator
from utils import AudioDataset, emphasis
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train Audio Enhancement')
    parser.add_argument('--batch_size', default=50, type=int, help='train batch size')
    parser.add_argument('--num_epochs', default=86, type=int, help='train epochs number')

    opt = parser.parse_args()
    BATCH_SIZE = opt.batch_size
    NUM_EPOCHS = opt.num_epochs

    # load data
    torch.cuda.empty_cache()
    print('loading data...')
    train_dataset = AudioDataset(data_type='train')
    test_dataset = AudioDataset(data_type='test')
    train_data_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    test_data_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
    # generate reference batch
    ref_batch = train_dataset.reference_batch(BATCH_SIZE)

    # create D and G instances
    discriminator = Discriminator()
    generator = Generator()
    if torch.cuda.is_available():
        discriminator.cuda()
        generator.cuda()
        ref_batch = ref_batch.cuda()
    ref_batch = Variable(ref_batch)
    print("# generator parameters:", sum(param.numel() for param in generator.parameters()))
    print("# discriminator parameters:", sum(param.numel() for param in discriminator.parameters()))
    # optimizers
    g_optimizer = optim.RMSprop(generator.parameters(), lr=0.0001)
    d_optimizer = optim.RMSprop(discriminator.parameters(), lr=0.0001)

    for epoch in range(NUM_EPOCHS):
        train_bar = tqdm(train_data_loader)
        for train_batch, train_clean, train_noisy in train_bar:

            # latent vector - normal distribution
            z = nn.init.normal(torch.Tensor(train_batch.size(0), 1024, 8))
            if torch.cuda.is_available():
                train_batch, train_clean, train_noisy = train_batch.cuda(), train_clean.cuda(), train_noisy.cuda()
                z = z.cuda()
            train_batch, train_clean, train_noisy = Variable(train_batch), Variable(train_clean), Variable(train_noisy)
            z = Variable(z)

            # TRAIN D to recognize clean audio as clean
            # training batch pass
            discriminator.zero_grad()
            outputs = discriminator(train_batch, ref_batch)
            clean_loss = torch.mean((outputs - 1.0) ** 2)  # L2 loss - we want them all to be 1
            clean_loss.backward()

            # TRAIN D to recognize generated audio as noisy
            generated_outputs = generator(train_noisy, z)
            outputs = discriminator(torch.cat((generated_outputs, train_noisy), dim=1), ref_batch)
            noisy_loss = torch.mean(outputs ** 2)  # L2 loss - we want them all to be 0
            noisy_loss.backward()

            # d_loss = clean_loss + noisy_loss
            d_optimizer.step()  # update parameters

            # TRAIN G so that D recognizes G(z) as real
            generator.zero_grad()
            generated_outputs = generator(train_noisy, z)
            gen_noise_pair = torch.cat((generated_outputs, train_noisy), dim=1)
            outputs = discriminator(gen_noise_pair, ref_batch)
            g_loss_ = 0.5 * torch.mean((outputs - 1.0) ** 2)
            # L1 loss between generated output and clean sample
            l1_dist = torch.abs(torch.add(generated_outputs, torch.neg(train_clean)))
            g_cond_loss = 100 * torch.mean(l1_dist)  # conditional loss
            g_loss = g_loss_ + g_cond_loss

            # backprop + optimize
            g_loss.backward()
            g_optimizer.step()

            train_bar.set_description(
                'Epoch {}: d_clean_loss {:.4f}, d_noisy_loss {:.4f}, g_loss {:.4f}, g_conditional_loss {:.4f}'
                .format(epoch + 1, clean_loss.data[0], noisy_loss.data[0], g_loss.data[0], g_cond_loss.data[0]))

        # TEST model
        test_bar = tqdm(test_data_loader, desc='Test model and save generated audios')
        for test_file_names, test_noisy in test_bar:
            z = nn.init.normal(torch.Tensor(test_noisy.size(0), 1024, 8))
            if torch.cuda.is_available():
                test_noisy, z = test_noisy.cuda(), z.cuda()
            test_noisy, z = Variable(test_noisy), Variable(z)
            fake_speech = generator(test_noisy, z).data.cpu().numpy()  # convert to numpy array
            fake_speech = emphasis(fake_speech, emph_coeff=0.95, pre=False)

            for idx in range(fake_speech.shape[0]):
                generated_sample = fake_speech[idx]
                file_name = os.path.join('results',
                                         '{}_e{}.wav'.format(test_file_names[idx].replace('.npy', ''), epoch + 1))
                wavfile.write(file_name, sample_rate, generated_sample.T)

        # save the model parameters for each epoch
        g_path = os.path.join('epochs', 'generator-{}.pkl'.format(epoch + 1))
        d_path = os.path.join('epochs', 'discriminator-{}.pkl'.format(epoch + 1))
        torch.save(generator.state_dict(), g_path)
        torch.save(discriminator.state_dict(), d_path)
Answer 0 (score: 1)
Try decreasing the batch size (as David S mentioned). Also, run the test pass inside a with torch.no_grad(): block so that no gradient computation takes place. If you want to train with a larger batch size and you are running out of memory, one solution is gradient accumulation; both ideas are sketched below.
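A minimal sketch of those two suggestions, with hypothetical stand-in names (model, optimizer, criterion, train_loader and test_loader are placeholders, not objects from the question's script):

import torch

accumulation_steps = 4  # effective batch size = accumulation_steps * loader batch size

model.train()
optimizer.zero_grad()
for step, (inputs, targets) in enumerate(train_loader):
    loss = criterion(model(inputs), targets)
    (loss / accumulation_steps).backward()  # scale so accumulated gradients average out
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()       # update once per accumulation window
        optimizer.zero_grad()  # reset the accumulated gradients

model.eval()
with torch.no_grad():  # no autograd graph is built, so activations are freed immediately
    for inputs, _ in test_loader:
        outputs = model(inputs)

Because each small batch contributes loss / accumulation_steps, the accumulated gradient approximates the gradient of one large batch while only one small batch ever resides in GPU memory.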
Answer 1 (score: 1)
You can do a couple of things:

- Use with torch.no_grad() during testing, as it will not keep gradients in memory, which saves CUDA memory.
- Use mixed precision training. You need PyTorch 1.6.0 to do it easily; check the documentation. Where viable (e.g. torch.nn.Conv2d), parameters will be cast to float16, which speeds up training and requires less memory (in some cases layers like BatchNorm will be kept as float32 because of their running statistics).
- Since you have both a discriminator and a generator, see the "Working with Multiple Models, Losses, and Optimizers" section of that documentation.

Sample code for multiple models (refer to the documentation for more info and how exactly it works):

scaler = torch.cuda.amp.GradScaler()

for epoch in epochs:
    for input, target in data:
        optimizer0.zero_grad()
        optimizer1.zero_grad()
        with autocast():
            output0 = model0(input)
            output1 = model1(input)
            loss0 = loss_fn(2 * output0 + 3 * output1, target)
            loss1 = loss_fn(3 * output0 - 5 * output1, target)

        scaler.scale(loss0).backward(retain_graph=True)
        scaler.scale(loss1).backward()

        # You can choose which optimizers receive explicit unscaling, if you
        # want to inspect or modify the gradients of the params they own.
        scaler.unscale_(optimizer0)

        scaler.step(optimizer0)
        scaler.step(optimizer1)

        scaler.update()

(The key point is that a single scaler.update() is performed at the end of each iteration.)
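For completeness, here is an untested sketch of how that pattern might map onto the discriminator/generator loop from the question (assuming PyTorch >= 1.6.0; the variable names come from the script above, and the discriminator's two separate backward calls are merged into one combined loss, which is equivalent here):

from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()

# inside the inner training loop, in place of the plain D and G updates
d_optimizer.zero_grad()
with autocast():  # forward passes run in float16 where it is safe to do so
    clean_loss = torch.mean((discriminator(train_batch, ref_batch) - 1.0) ** 2)
    generated_outputs = generator(train_noisy, z)
    fake_pair = torch.cat((generated_outputs, train_noisy), dim=1)
    noisy_loss = torch.mean(discriminator(fake_pair, ref_batch) ** 2)
scaler.scale(clean_loss + noisy_loss).backward()
scaler.step(d_optimizer)

g_optimizer.zero_grad()
with autocast():
    generated_outputs = generator(train_noisy, z)
    outputs = discriminator(torch.cat((generated_outputs, train_noisy), dim=1), ref_batch)
    g_loss = (0.5 * torch.mean((outputs - 1.0) ** 2)
              + 100 * torch.mean(torch.abs(generated_outputs - train_clean)))
scaler.scale(g_loss).backward()
scaler.step(g_optimizer)

scaler.update()  # called once per iteration, after both optimizer steps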