PyTorch LSTM 在训练过程中没有学习

时间:2020-09-28 13:58:26

标签: python pytorch

我有以下简单的LSTM网络:

class LSTMModel(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_dim: Feature size of each time step (passed to ``nn.LSTM``
            as its input size).
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Size of the vector produced by the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # batch_first=True: inputs are laid out as (batch, seq, feature).
        self.rnn = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Not read anywhere in this class; kept so that existing attribute
        # access from outside keeps working.
        self.batch_size = None
        self.hidden = None

    def forward(self, x):
        """Run the LSTM from a zero state and project the last time step.

        Args:
            x: Input tensor laid out as (batch, seq, input_dim).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        h0, c0 = self.init_hidden(x)
        # The final (h_n, c_n) state is not needed, only the per-step outputs.
        out, _ = self.rnn(x, (h0, c0))
        # Keep only the output of the last time step for each sequence.
        return self.fc(out[:, -1, :])

    def init_hidden(self, x):
        """Return zero-filled initial hidden and cell states for ``x``.

        The states are created with the dtype and device of ``x`` so the
        model also works when it has been moved to another device or cast
        to another floating-point type.

        Args:
            x: Input batch; ``x.size(0)`` is used as the batch size.

        Returns:
            A two-element list ``[h0, c0]``, each of shape
            (layer_dim, batch, hidden_dim).
        """
        shape = (self.layer_dim, x.size(0), self.hidden_dim)
        h0 = torch.zeros(shape, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(shape, dtype=x.dtype, device=x.device)
        return [h0, c0]

我按如下方式初始化此模型:

# Positional arguments map to input_dim=28, hidden_dim=10, layer_dim=6,
# output_dim=1 in LSTMModel.__init__.
model = LSTMModel(28, 10, 6, 1)

即每个输入实例具有 6 个时间步,每个时间步的维数为 28,隐藏维数为 10。输入被映射到维度为 1 的输出。

训练数据按大小为 16 的批次准备,这意味着在训练循环中传入的数据具有以下形状:

torch.Size([16, 6, 28])

标签的形状为:

# Size of the first element of the second entry in `batches`.
# NOTE(review): the training loop unpacks each entry as (X, y), so index 0
# looks like the inputs rather than the labels — confirm which was intended.
batches[1][0].size()

输入示例:

tensor([[-0.3674,  0.0347, -0.2169, -0.0821, -0.3673, -0.1773,  1.1840, -0.2669,
         -0.4202, -0.1473, -0.1132, -0.4756, -0.3565,  0.5010,  0.1274, -0.1147,
          0.2783,  0.0836, -1.3251, -0.8067, -0.6447, -0.7396, -0.3241,  1.3329,
          1.3801,  0.8198,  0.6098,  0.0697],
        [-0.2710,  0.1596, -0.2524, -0.0821, -0.3673, -0.1773,  0.0302, -0.2099,
         -0.4550,  0.1451, -0.4561, -0.5207, -0.5657, -0.5287, -0.2690, -0.1147,
         -0.0346, -0.1043, -0.7515, -0.8392, -0.4745, -0.7396, -0.3924,  0.8122,
         -0.1624, -1.2198,  0.0326, -0.9306],
        [-0.1746,  0.0972, -0.2702, -0.0821, -0.3673, -0.1773, -0.0468, -1.1225,
         -0.4480, -0.4397,  0.4011, -1.1073, -1.0536, -0.1855, -0.7502, -0.1147,
         -0.0146, -0.1545, -0.1919, -0.1674,  0.0930, -0.7396,  0.8106,  1.1594,
          0.4546, -1.2198, -0.5446, -1.2640],
        [-0.2710,  0.0660, -0.2524, -0.0821, -0.4210, -0.1773,  1.8251, -0.5236,
         -0.4410, -0.7321,  0.4011, -0.6110, -0.2171,  1.1875, -0.2973, -0.1147,
         -0.1278,  0.7728, -0.9334, -0.5141, -2.1202,  1.3521, -0.9393,  0.5085,
         -0.4709,  0.8198, -1.1218,  0.0697],
        [-0.3674, -0.0277, -0.2347, -0.0821, -0.0448, -0.1773,  0.2866, -0.1386,
         -0.4271,  0.4375, -0.2847, -0.1146, -0.4262, -0.3571, -0.0425, -0.1147,
         -0.4207, -0.4552, -0.5277, -0.9584, -0.4177, -0.7396, -0.2967,  0.5085,
          0.4546, -1.2198, -0.3522, -1.2640],
        [-0.3674, -0.1447, -0.1991, -0.0821,  0.1701, -0.1773,  0.0430,  0.1324,
         -0.4271,  0.7299, -0.4561,  0.2915, -0.5657, -0.1855, -0.2123, -0.1147,
         -0.0413, -0.8311, -0.6396, -1.0451, -0.4177, -0.7396, -0.2967, -0.4028,
          0.7631, -1.2198, -0.3522, -1.2640]])

当我按如下方式训练模型时:

# Train for a fixed number of epochs and report test-set accuracy after each.
Epochs = 10
# NOTE(review): batch_size is not read anywhere in this snippet; the actual
# batch size is whatever `batches` was built with — confirm they agree.
batch_size = 32
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
for epoch in range(Epochs):
    print(f"Epoch {epoch + 1}")
    # Evaluation at the end of the previous epoch switched the model to eval
    # mode, so switch back to training mode once per epoch.
    model.train()
    for n, (X, y) in enumerate(batches):
        optimizer.zero_grad()
        y_pred = model(X)
        # NOTE(review): confirm y has the same shape as y_pred; MSELoss
        # broadcasts mismatched shapes, which can silently distort the loss.
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()


    model.eval()
    accurate = 0
    # No gradients are needed for evaluation; skip building autograd graphs.
    with torch.no_grad():
        for X_instance, y_instance in zip(test_X, test_y):
            # Reshape a single instance to a batch of one: (1, 6, 28).
            prediction = round(model(X_instance.view(-1, 6, 28)).item())
            if y_instance == prediction:
                accurate += 1
    print(f"Accuracy test set: {accurate/len(test_X)}")

准确率无法收敛:

Epoch 1
Accuracy test set: 0.23169107856191745
Sample params: 
tensor([-0.3356, -0.0105, -0.3405, -0.0049,  0.0037,  0.1707,  0.2685, -0.3893,
        -0.4707, -0.2872, -0.1544, -0.1455,  0.0393,  0.0774, -0.4194,  0.0780,
        -0.2177, -0.3829, -0.4679,  0.0370, -0.0794,  0.0455, -0.1331, -0.0169,
        -0.1551, -0.0348,  0.1746, -0.5163], grad_fn=<SelectBackward>)
tensor([ 0.2137, -0.2558,  0.1509, -0.0975,  0.5591,  0.0907, -0.1249,  0.3095,
         0.2112,  0.3134, -0.1581, -0.3051, -0.3559, -0.0177,  0.1485,  0.4397,
        -0.1441,  0.1705,  0.3230, -0.3236,  0.0692,  0.0920, -0.2691, -0.3695,
        -0.0692,  0.3747,  0.0149,  0.5216], grad_fn=<SelectBackward>)
Epoch 2
Accuracy test set: 0.23049267643142476
Sample params: 
tensor([-0.3483, -0.0144, -0.3512,  0.0213, -0.0081,  0.1777,  0.2674, -0.4031,
        -0.4628, -0.3041, -0.1651, -0.1511,  0.0216,  0.0513, -0.4320,  0.0839,
        -0.2602, -0.3629, -0.4541,  0.0398, -0.0768,  0.0432, -0.1150, -0.0160,
        -0.1346, -0.0727,  0.1801, -0.5253], grad_fn=<SelectBackward>)
tensor([ 0.1879, -0.2534,  0.1461, -0.1141,  0.5735,  0.0872, -0.1286,  0.3273,
         0.2084,  0.3037, -0.1535, -0.2934, -0.3870, -0.0252,  0.1492,  0.4752,
        -0.1709,  0.1776,  0.3390, -0.3318,  0.0734,  0.1077, -0.2790, -0.3777,
        -0.0518,  0.3726,  0.0228,  0.5404], grad_fn=<SelectBackward>)
Epoch 3
Accuracy test set: 0.22982689747003995
Sample params: 
tensor([-0.3725, -0.0069, -0.3623,  0.0393, -0.0167,  0.1748,  0.2577, -0.4183,
        -0.4681, -0.3196, -0.1657, -0.1613,  0.0122,  0.0268, -0.4361,  0.0838,
        -0.2962, -0.3566, -0.4344,  0.0366, -0.0822,  0.0486, -0.1150, -0.0295,
        -0.1080, -0.1094,  0.1841, -0.5336], grad_fn=<SelectBackward>)
tensor([ 0.1664, -0.2456,  0.1477, -0.1332,  0.5820,  0.0819, -0.1228,  0.3426,
         0.2066,  0.2985, -0.1464, -0.2824, -0.4199, -0.0323,  0.1530,  0.5057,
        -0.1991,  0.1856,  0.3407, -0.3347,  0.0800,  0.1203, -0.2791, -0.3863,
        -0.0426,  0.3760,  0.0327,  0.5641], grad_fn=<SelectBackward>)
Epoch 4
Accuracy test set: 0.23249001331557922
Sample params: 
tensor([-0.3945,  0.0032, -0.3765,  0.0600, -0.0248,  0.1713,  0.2442, -0.4297,
        -0.4741, -0.3311, -0.1653, -0.1667,  0.0029,  0.0066, -0.4373,  0.0738,
        -0.3320, -0.3530, -0.4136,  0.0390, -0.0731,  0.0552, -0.1117, -0.0517,
        -0.0871, -0.1455,  0.1841, -0.5359], grad_fn=<SelectBackward>)
tensor([ 0.1495, -0.2292,  0.1524, -0.1473,  0.5938,  0.0661, -0.1157,  0.3626,
         0.2013,  0.2927, -0.1350, -0.2661, -0.4558, -0.0411,  0.1562,  0.5381,
        -0.2279,  0.1927,  0.3319, -0.3431,  0.0852,  0.1402, -0.2747, -0.4026,
        -0.0297,  0.3757,  0.0396,  0.5856], grad_fn=<SelectBackward>)

我在模型定义上犯了一个错误吗?

1 个答案:

答案 0 :(得分:2)

通常来说,LSTM 中使用 6 层实在太多了。输入维度为 28(您是在训练 MNIST,还是输入是字母?),因此将 10 作为隐藏维度实际上太小了。请尝试以下参数:

hidden_dim = 128 to 512
layer_dim = 2 to max. 4

我看到您的输出形状为 1,并且您没有使用激活函数。您是否要预测整数类别(例如,“狗”类别为 1,“猫”类别为 2)?如果是这样,您应该改用独热(one-hot)编码,使输出形状等于您要预测的类别数,然后使用 softmax 作为最后一层的激活函数。