Question

我试图训练卷积自动编码器来编码和解码单声道midi剪辑的钢琴卷表示。我将音符范围缩小到3个八度音阶，将歌曲分成100个时间步长片（其中1个时间步长= 1/100秒），并分批训练3个片段。

我使用Adagrad作为我的优化器，MSE作为我的损失功能。损失是巨大的，即使数百个训练样例被输入，我也看不到平均损失的减少。

这是我的代码：

"""
Most absolutely simple assumptions:
  - not changing the key of any of the files
  - not changing the tempo of any of the files

- take blocks of 36 by 100
- divide up all songs by this amount, cutting off any excess from the 
end, train
"""
from __future__ import print_function
import cPickle as pickle
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from reverse_pianoroll import piano_roll_to_pretty_midi as pr2pm

N = 1000
# load a NxMxC dataset
    # N: Number of clips
    # M: Piano roll size, the number of midi notes that could possibly be 'on'
    # C: Clip length, in 100ths of a second
dataset = pickle.load(open('mh-midi-data.pickle', 'rb'))
######## take a subset of the data for training ######
# based on the mean and standard deviation of non zero entries in the data, I've
# found that the most populous, and thus best range of notes to take is from
# 48 to 84 (C2 - C5); this is 3 octaves, which is much less than the original
# 10 and a half. Additionally, we're going to take a subsample of 1000 because
# i'm training on my macbook and the network is pretty simple
######################################################
dataset = dataset[:, :, 48:84, :]
dataset = dataset[:N]
######################################################

midi_dim, clip_len = dataset.shape[2:]

class Autoencoder(nn.Module):
    def __init__(self, **kwargs):
        super(Autoencoder, self).__init__(**kwargs)
        # input is 3 x 1 x 36 x 100
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=14, kernel_size=(midi_dim, 2))
        # now transformed to 3 x 14 x 1 x 99
        self.conv2 = nn.Conv2d(in_channels=14, out_channels=77, kernel_size=(1, 4))
        # now transformed to 3 x 77 x 1 x 96
        input_size = 3*77*1*96
        self.fc1 = nn.Linear(input_size, input_size/2)
        self.fc2 = nn.Linear(input_size/2, input_size/4)
        self.fc3 = nn.Linear(input_size/4, input_size/2)
        self.fc4 = nn.Linear(input_size/2, input_size)
        self.tconv2 = nn.ConvTranspose2d(in_channels=77, out_channels=14, kernel_size=(1, 4))
        self.tconv1 = nn.ConvTranspose2d(in_channels=14, out_channels=1, kernel_size=(midi_dim, 2))
        self.sigmoid = nn.Sigmoid()
        return

    def forward(self, x):
        # print("1: {}".format(x.size()))
        x = F.relu(self.conv1(x))
        # print("2: {}".format(x.size()))
        x = F.relu(self.conv2(x))
        # print("3: {}".format(x.size()))
        x = x.view(-1, np.prod(x.size()[:]))
        # print("4: {}".format(x.size()))
        x = F.relu(self.fc1(x))
        # print("5: {}".format(x.size()))
        h = F.relu(self.fc2(x))
        # print("6: {}".format(h.size()))
        d = F.relu(self.fc3(h))
        # print("7: {}".format(d.size()))
        d = F.relu(self.fc4(d))
        # print("8: {}".format(d.size()))
        d = d.view(3, 77, 1, 96)
        # print("9: {}".format(d.size()))
        d = F.relu(self.tconv2(d))
        # print("10: {}".format(d.size()))
        d = self.tconv1(d)
        d = self.sigmoid(d)
        # print("11: {}".format(d.size()))
        return d


net = Autoencoder()
loss_fn = nn.MSELoss()
# optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9)
optimizer = optim.Adagrad(net.parameters(), lr=1e-3)

batch_count = 0
avg_loss = 0.0
print_every = 3
print("Beginning Training")
for epoch in xrange(2):
    # for i, clip in enumerate(dataset):
    for i in xrange(len(dataset)/3):
        batch = dataset[(3*i):(3*i + 3), :, :]
        # get the input, wrap it in a Variable
        inpt = Variable(torch.from_numpy(batch).type(torch.FloatTensor))

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outpt = net(inpt)
        loss = loss_fn(outpt, inpt)
        loss.backward()
        optimizer.step()

        # print stats out
        avg_loss += loss.data[0]
        if batch_count % print_every == print_every - 1:
           print('epoch: %d, batch_count: %d, loss: %.3f'%(
                epoch + 1, batch_count + 1, avg_loss / print_every))
           avg_loss = 0.0
        batch_count += 1

print('Finished Training')

我真的是这个东西的初学者，所以任何建议都会非常感激。

Answer 1

仔细检查您将inpt规范化为0到1.例如，如果您正在使用图像，则可以将inpt变量除以255.

卷积自动编码器的损失不会减少

1 个答案: