I am training my network over and over on a very small dataset (size = 2). It looks fine at the start, but after epoch 34 the loss stops decreasing.
I thought this might be caused by the dying-ReLU problem, so I replaced all ReLU activations with Softplus. The problem persists.
I also tried adding some linear hidden layers combined with Softplus, with no result.
If I use a dataset with only one example, the loss goes to 0 as expected.
What could cause this behavior, and how can I avoid it?
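To test the dying-ReLU hypothesis directly, one option is a forward hook that counts exactly-zero activations. A minimal sketch for the original ReLU variant of the network (the `self.relu` attribute name is an assumption, since the code below already has the Softplus swap applied):

def count_dead_units(module, input, output):
    # Count activations that are exactly zero after this layer; a large,
    # persistent fraction would point at dead ReLU units.
    dead = (output.data == 0).sum()
    print(module.__class__.__name__, 'dead activations:', dead, '/', output.data.numel())

neural.relu.register_forward_hook(count_dead_units)  # hypothetical: ReLU version of the model

Here is the training log where the plateau shows up: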
Epoch 0
Expected: [243.0, 0.0]; Predicted: [-2367.9, 3.8]; Loss: 6059457.6000
Expected: [178.0, 32.0]; Predicted: [-1731.4, 10.9]; Loss: 3241238.0000
Epoch 1
Expected: [243.0, 0.0]; Predicted: [-1237.8, 8.1]; Loss: 1949257.6000
Expected: [178.0, 32.0]; Predicted: [-883.9, 14.1]; Loss: 1002567.6000
Epoch 2
Expected: [243.0, 0.0]; Predicted: [-602.2, 10.6]; Loss: 635017.6000
Expected: [178.0, 32.0]; Predicted: [-407.1, 15.9]; Loss: 304548.4500
...
Epoch 12
Expected: [243.0, 0.0]; Predicted: [212.6, 13.7]; Loss: 991.0653
Expected: [178.0, 32.0]; Predicted: [203.9, 18.3]; Loss: 764.2527
Epoch 13
Expected: [243.0, 0.0]; Predicted: [213.7, 13.7]; Loss: 930.9330
Expected: [178.0, 32.0]; Predicted: [204.8, 18.3]; Loss: 803.9944
...
Epoch 32
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9812
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9783
Epoch 33
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9806
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9789
Epoch 34
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 35
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 36
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 37
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 38
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 39
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 40
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 41
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Epoch 42
Expected: [243.0, 0.0]; Predicted: [215.1, 13.7]; Loss: 856.9800
Expected: [178.0, 32.0]; Predicted: [205.9, 18.3]; Loss: 856.9795
Here is the code:
from torch import cat, zeros, LongTensor
from torch.autograd import Variable
from torch.nn import Module, Softplus, Embedding, Conv2d, Linear, GRU
from torch.nn.functional import max_pool2d

class NN(Module):
    def __init__(self, vocab_size, *_, hidden_size=200, text_max_len=200, rnn_num_layers=1, dropout=0.2):
        super(NN, self).__init__()
        self.rnn_layer_range = list(range(rnn_num_layers))
        self.hidden_size = hidden_size
        self.text_max_len = text_max_len
        self.softplus = Softplus()
        # text branch:
        self.text_embed = Embedding(vocab_size, hidden_size, padding_idx=0)
        self.text_conv1 = Conv2d( 1,  6, 5)
        self.text_conv2 = Conv2d( 6, 12, 5)
        self.text_conv3 = Conv2d(12, 24, 5)
        self.text_lin1 = Linear(4536, hidden_size)
        # image branch:
        self.img_conv1 = Conv2d( 3,  6, 5)
        self.img_conv2 = Conv2d( 6, 16, 5)
        self.img_conv3 = Conv2d(16, 20, 5)
        self.img_lin1 = Linear(2420, hidden_size)
        # union of the three branches:
        self.u_size = 3 * hidden_size
        self.u_linear_augment = Linear(9, hidden_size)
        self.u_gru = GRU(input_size=self.u_size, hidden_size=self.u_size, dropout=dropout)
        self.u_linear_reduce1 = Linear(self.u_size, self.u_size // 2)
        self.u_linear_reduce2 = Linear(self.u_size // 2, 2)

    def initHidden(self):
        return Variable(zeros(1, 1, self.hidden_size)), Variable(zeros(1, 1, self.u_size))

    def forward(self, text, img, data, *_, text_hidden=None, u_hidden=None):
        # note: any passed-in hidden states are discarded and re-initialized here
        text_hidden, u_hidden = self.initHidden()
        # encode text
        max_len = self.text_max_len
        if len(text) < max_len:
            text = cat((text, Variable(LongTensor(max_len - len(text)).zero_())))
        text = self.text_embed(text)
        text = text.view(1, 1, max_len, self.hidden_size)
        text = max_pool2d(self.softplus(self.text_conv1(text)), 2)
        text = max_pool2d(self.softplus(self.text_conv2(text)), 2)
        text = max_pool2d(self.softplus(self.text_conv3(text)), 2)
        text = text.view(1, -1)
        text = self.softplus(self.text_lin1(text))
        text = text.view(-1, 1)
        # encode image
        img = max_pool2d(self.softplus(self.img_conv1(img)), 2)
        img = max_pool2d(self.softplus(self.img_conv2(img)), 2)
        img = max_pool2d(self.softplus(self.img_conv3(img)), 2)
        img = img.view(1, -1)
        img = self.softplus(self.img_lin1(img))
        img = img.view(-1, 1)
        # join the three encodings and run them through the GRU
        data = self.softplus(self.u_linear_augment(data))
        vector = cat((data, text, img)).view(1, 1, -1)
        for _ in self.rnn_layer_range:
            vector, u_hidden = self.u_gru(vector, u_hidden)
        vector = self.softplus(self.u_linear_reduce1(vector))
        vector = self.u_linear_reduce2(vector)
        return vector.view(-1, 1)
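As an aside, the hard-coded input sizes 4536 and 2420 on text_lin1 and img_lin1 are what the two conv stacks flatten to. A quick way to sanity-check them is to push dummy tensors through the conv/pool layers alone. A minimal sketch; the 3 x 118 x 118 image size is my assumption (any input size whose branch flattens to 2420 would do):

from torch import zeros, randn
from torch.autograd import Variable
from torch.nn.functional import max_pool2d

net = NN(vocab_size=100, hidden_size=100, dropout=0)

# text branch input: (batch, channel, text_max_len, hidden_size)
t = Variable(zeros(1, 1, 200, 100))
for conv in (net.text_conv1, net.text_conv2, net.text_conv3):
    t = max_pool2d(conv(t), 2)
print(t.view(1, -1).size())   # (1, 4536) = 24 channels * 21 * 9

# image branch, assuming a 3 x 118 x 118 input:
i = Variable(randn(1, 3, 118, 118))
for conv in (net.img_conv1, net.img_conv2, net.img_conv3):
    i = max_pool2d(conv(i), 2)
print(i.view(1, -1).size())   # (1, 2420) = 20 channels * 11 * 11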
and
def train(neuron_network, optimizer, criterion, text, img, data, target, *_, loop_size=5):
    optimizer.zero_grad()
    loss = 0
    # run the same example loop_size times and accumulate the loss,
    # then do a single backward pass and optimizer step
    for _ in range(loop_size):
        output = neuron_network(text, img, data)
        loss += criterion(output, target)
    loss.backward()
    optimizer.step()
    return loss.data[0] / loop_size
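One diagnostic that fits here: after train() returns, the gradients from the last backward pass are still stored on the parameters (zero_grad only runs at the start of the next call), so their norms can be inspected directly. A minimal sketch, using the names from the training code below:

loss = train(neural, optimizer, criterion, item["text"], item["img"], item["data"], item["target"])
for name, param in neural.named_parameters():
    # norms near zero across all layers at the plateau would point at
    # vanishing gradients rather than, say, a learning-rate problem
    print(name, param.grad.data.norm())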
And here is how I train it:
neural = NN(vocab_size=len(letter_dict)+1, dropout=0, rnn_num_layers=1, hidden_size=100)
optimizer = optim.SGD(neural.parameters(), lr=0.01)
criterion = nn.MSELoss()

for epoch in range(500):
    for item in dataset:
        loss = train(neural, optimizer, criterion, item["text"], item["img"], item["data"], item["target"])
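Finally, to confirm the optimizer has really stalled rather than the printed loss merely looking flat, one could snapshot a weight matrix before an epoch and diff it afterwards. A hypothetical diagnostic, reusing the objects above:

before = neural.u_linear_reduce2.weight.data.clone()
for item in dataset:
    train(neural, optimizer, criterion, item["text"], item["img"], item["data"], item["target"])
delta = (neural.u_linear_reduce2.weight.data - before).abs().max()
print("max weight change over one epoch:", delta)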