我试图训练卷积自动编码器来编码和解码单声道midi剪辑的钢琴卷表示。我将音符范围缩小到3个八度音阶,将歌曲分成100个时间步长片(其中1个时间步长= 1/100秒),并分批训练3个片段。
我使用Adagrad作为我的优化器,MSE作为我的损失功能。损失是巨大的,即使数百个训练样例被输入,我也看不到平均损失的减少。
这是我的代码:
"""
Most absolutely simple assumptions:
- not changing the key of any of the files
- not changing the tempo of any of the files
- take blocks of 36 by 100
- divide up all songs by this amount, cutting off any excess from the
end, train
"""
from __future__ import print_function
import cPickle as pickle
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from reverse_pianoroll import piano_roll_to_pretty_midi as pr2pm
N = 1000
# load a NxMxC dataset
# N: Number of clips
# M: Piano roll size, the number of midi notes that could possibly be 'on'
# C: Clip length, in 100ths of a second
dataset = pickle.load(open('mh-midi-data.pickle', 'rb'))
######## take a subset of the data for training ######
# based on the mean and standard deviation of non zero entries in the data, I've
# found that the most populous, and thus best range of notes to take is from
# 48 to 84 (C2 - C5); this is 3 octaves, which is much less than the original
# 10 and a half. Additionally, we're going to take a subsample of 1000 because
# i'm training on my macbook and the network is pretty simple
######################################################
dataset = dataset[:, :, 48:84, :]
dataset = dataset[:N]
######################################################
midi_dim, clip_len = dataset.shape[2:]
class Autoencoder(nn.Module):
def __init__(self, **kwargs):
super(Autoencoder, self).__init__(**kwargs)
# input is 3 x 1 x 36 x 100
self.conv1 = nn.Conv2d(in_channels=1, out_channels=14, kernel_size=(midi_dim, 2))
# now transformed to 3 x 14 x 1 x 99
self.conv2 = nn.Conv2d(in_channels=14, out_channels=77, kernel_size=(1, 4))
# now transformed to 3 x 77 x 1 x 96
input_size = 3*77*1*96
self.fc1 = nn.Linear(input_size, input_size/2)
self.fc2 = nn.Linear(input_size/2, input_size/4)
self.fc3 = nn.Linear(input_size/4, input_size/2)
self.fc4 = nn.Linear(input_size/2, input_size)
self.tconv2 = nn.ConvTranspose2d(in_channels=77, out_channels=14, kernel_size=(1, 4))
self.tconv1 = nn.ConvTranspose2d(in_channels=14, out_channels=1, kernel_size=(midi_dim, 2))
self.sigmoid = nn.Sigmoid()
return
def forward(self, x):
# print("1: {}".format(x.size()))
x = F.relu(self.conv1(x))
# print("2: {}".format(x.size()))
x = F.relu(self.conv2(x))
# print("3: {}".format(x.size()))
x = x.view(-1, np.prod(x.size()[:]))
# print("4: {}".format(x.size()))
x = F.relu(self.fc1(x))
# print("5: {}".format(x.size()))
h = F.relu(self.fc2(x))
# print("6: {}".format(h.size()))
d = F.relu(self.fc3(h))
# print("7: {}".format(d.size()))
d = F.relu(self.fc4(d))
# print("8: {}".format(d.size()))
d = d.view(3, 77, 1, 96)
# print("9: {}".format(d.size()))
d = F.relu(self.tconv2(d))
# print("10: {}".format(d.size()))
d = self.tconv1(d)
d = self.sigmoid(d)
# print("11: {}".format(d.size()))
return d
net = Autoencoder()
loss_fn = nn.MSELoss()
# optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)
optimizer = optim.Adagrad(net.parameters(), lr=1e-3)
batch_count = 0
avg_loss = 0.0
print_every = 3
print("Beginning Training")
for epoch in xrange(2):
# for i, clip in enumerate(dataset):
for i in xrange(len(dataset)/3):
batch = dataset[(3*i):(3*i + 3), :, :]
# get the input, wrap it in a Variable
inpt = Variable(torch.from_numpy(batch).type(torch.FloatTensor))
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outpt = net(inpt)
loss = loss_fn(outpt, inpt)
loss.backward()
optimizer.step()
# print stats out
avg_loss += loss.data[0]
if batch_count % print_every == print_every - 1:
print('epoch: %d, batch_count: %d, loss: %.3f'%(
epoch + 1, batch_count + 1, avg_loss / print_every))
avg_loss = 0.0
batch_count += 1
print('Finished Training')
我真的是这个东西的初学者,所以任何建议都会非常感激。
答案 0 :(得分:2)
仔细检查您将inpt规范化为0到1.例如,如果您正在使用图像,则可以将inpt变量除以255.