I am trying to train a dual-encoder LSTM model for a chatbot with PyTorch.
I defined two classes: the Encoder class defines the LSTM itself, while the DualEncoder class applies the Encoder to the context and response utterances I want to train on:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.autograd as autograd
import torch.optim as optim

# Helper functions (create_id_to_vec, create_vocab, get_emb_dim, make_matrices)
# are defined elsewhere in the script.

class Encoder(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size,
                 vocab_size,
                 num_layers = 1,
                 num_directions = 1,
                 dropout = 0,
                 bidirectional = False,
                 rnn_type = 'lstm'):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.num_layers = 1
        self.num_directions = 1
        self.dropout = 0
        self.bidirectional = False

        self.embedding = nn.Embedding(vocab_size, input_size, sparse = False, padding_idx = 0)

        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers,
                            batch_first=False, dropout = dropout, bidirectional=False).cuda()

        self.init_weights()

    def init_weights(self):
        init.orthogonal(self.lstm.weight_ih_l0)
        init.uniform(self.lstm.weight_hh_l0, a=-0.01, b=0.01)

        embedding_weights = torch.FloatTensor(self.vocab_size, self.input_size).cuda()
        init.uniform(embedding_weights, a = -0.25, b = 0.25)

        id_to_vec, emb_dim = create_id_to_vec('/data/train_shuffled_onethousand.csv', '/data/glove.6B.100d.txt')

        for id, vec in id_to_vec.items():
            embedding_weights[id] = vec

        del self.embedding.weight
        self.embedding.weight = nn.Parameter(embedding_weights)
        self.embedding.weight.requires_grad = True
        #self.embedding.weight.data.copy_(torch.from_numpy(self.embedding_weights))

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        outputs, hiddens = self.lstm(embeddings)
        return outputs, hiddens
#%%
class DualEncoder(nn.Module):
    def __init__(self, encoder):
        super(DualEncoder, self).__init__()
        self.encoder = encoder
        self.number_of_layers = 1

        #h_0 (num_layers * num_directions, batch, hidden_size):
        #tensor containing the initial hidden state for each element in the batch.
        #dual_hidden_size = self.encoder.hidden_size * self.encoder.num_directions

        M = torch.FloatTensor(self.encoder.hidden_size, self.encoder.hidden_size).cuda()
        init.normal(M)
        self.M = nn.Parameter(M, requires_grad = True)

    def forward(self, contexts, responses):
        #output (seq_len, batch, hidden_size * num_directions):
        #tensor containing the output features (h_t) from the last layer
        #of the RNN, for each t.

        #h_n (num_layers * num_directions, batch, hidden_size):
        #tensor containing the hidden state for t=seq_len
        context_out, context_hn = self.encoder(contexts)
        response_out, response_hn = self.encoder(responses)

        scores_list = []
        y_preds = None

        for e in range(999):
            context_h = context_out[e][-1].view(1, self.encoder.hidden_size)
            response_h = response_out[e][-1].view(self.encoder.hidden_size, 1)

            dot_var = torch.mm(torch.mm(context_h, self.M), response_h)[0][0]

            dot_tensor = dot_var.data
            dot_tensor.cuda()

            score = torch.sigmoid(dot_tensor)
            scores_list.append(score)

        y_preds_tensor = torch.stack(scores_list).cuda()
        y_preds = autograd.Variable(y_preds_tensor).cuda()

        return y_preds
#%% TRAINING
torch.backends.cudnn.enabled = False
#%%
vocab = create_vocab('/data/train_shuffled_onethousand.csv')
vocab_len = len(vocab)
emb_dim = get_emb_dim('/data/glove.6B.100d.txt')
#%%
encoder_model = Encoder(
    input_size = emb_dim,
    hidden_size = 300,
    vocab_size = vocab_len)
encoder_model.cuda()
#%%
dual_encoder = DualEncoder(encoder_model)
dual_encoder.cuda()
#%%
loss_func = torch.nn.BCELoss()
loss_func.cuda()
learning_rate = 0.001
epochs = 100
#batch_size = 50
optimizer = optim.Adam(dual_encoder.parameters(),
                       lr = learning_rate)
#%%
for i in range(epochs):
    context_matrix, response_matrix, y = make_matrices('/data/train_shuffled_onethousand.csv')

    context_matrix = autograd.Variable(context_matrix, requires_grad=True).cuda()
    response_matrix = autograd.Variable(response_matrix, requires_grad=True).cuda()
    y_label = y.cuda()

    y_preds = dual_encoder(context_matrix, response_matrix)

    loss = loss_func(y_preds, y_label)

    if i % 10 == 0:
        print("Epoch: ", i, ", Loss: ", loss.data[0])
        #evaluation metrics...

    dual_encoder.zero_grad()
    loss.backward()

    torch.nn.utils.clip_grad_norm(dual_encoder.parameters(), 10)

    optimizer.step()
The following error occurs:
  result = self.forward(*input, **kwargs)
File "all_scripts.py", line 258, in forward
  context_out, context_hn = self.encoder(contexts)
File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 325, in __call__
  result = self.forward(*input, **kwargs)
File "all_scripts.py", line 229, in forward
  embeddings = self.embedding(inputs)
File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 325, in __call__
  result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/site-packages/torch/nn/modules/sparse.py", line 103, in forward
  self.scale_grad_by_freq, self.sparse
File "/usr/local/lib/python3.6/site-packages/torch/nn/_functions/thnn/sparse.py", line 40, in forward
  assert not ctx.needs_input_grad[0], "Embedding doesn't " \
AssertionError: Embedding doesn't compute the gradient w.r.t. the indices
I do understand why this happens (it indeed makes no sense to compute the gradient with respect to the indices). But I don't understand how to adjust the code so that it computes the gradient with respect to the values of the embedding vectors instead.
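For illustration, a minimal sketch of the behaviour in question (a hypothetical toy example using the pre-0.4 Variable API from the code above): gradients flow into the embedding weight Parameter, while the index input stays gradient-free:

import torch
import torch.nn as nn
from torch import autograd

# Toy example: nn.Embedding computes gradients w.r.t. its weight matrix
# (a Parameter), never w.r.t. the integer indices it is given.
embedding = nn.Embedding(10, 4)
indices = autograd.Variable(torch.LongTensor([[1, 2, 3]]))  # no requires_grad on the indices

out = embedding(indices).sum()
out.backward()

print(embedding.weight.grad)  # populated: gradient w.r.t. the embedding values
# Wrapping the indices with requires_grad=True is what triggers the
# assertion shown in the traceback above.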
Any help is greatly appreciated!
(See also this thread in the PyTorch forums.)
Answer 0 (score: 1)
After some fairly extensive adjustments the code now runs. The problem was not only the embedding initialization. For the improved code, see my github repo.
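One adjustment the traceback itself points to (a sketch of an assumed fix, not necessarily the exact change made in the linked repo): the context and response matrices contain word indices, so the two lines in the training loop that wrap them should not set requires_grad=True:

# Index matrices are LongTensors; they must not require gradients themselves.
context_matrix = autograd.Variable(context_matrix).cuda()
response_matrix = autograd.Variable(response_matrix).cuda()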
Answer 1 (score: 0)
I don't know why you are deleting the weight of the embedding layer. If you want to initialize the embedding weights, you should do the following:
self.embedding.weight.data.copy_(torch.from_numpy(self.embedding_weights))
You have commented out that line, but it is the best way to initialize the embedding weights.
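For concreteness, a minimal sketch of that pattern (the pretrained matrix and the sizes below are hypothetical placeholders):

import numpy as np
import torch
import torch.nn as nn

vocab_size, input_size = 1000, 100  # hypothetical sizes
pretrained = np.random.uniform(-0.25, 0.25, (vocab_size, input_size)).astype('float32')

embedding = nn.Embedding(vocab_size, input_size, padding_idx=0)
# Copy the pretrained vectors into the existing weight Parameter in place,
# instead of deleting and re-creating embedding.weight.
embedding.weight.data.copy_(torch.from_numpy(pretrained))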