Recently, while going through this PyTorch word embedding tutorial, I noticed that the order of the vocabulary affects the prediction results.
Below is a sample piece of code that illustrates the issue; it is adapted from Robert Guthrie's original code.
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as functional
import torch.optim as optim
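# fix PyTorch's random seed so that the random weight initialization is reproducible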
torch.manual_seed(1)
CONTEXT_SIZE = 2
EMBEDDING_DIM = 4
test_sentence = r"""<s> The mathematician ran . <\s>
<s> The mathematician ran to the store . <\s>
<s> The physicist ran to the store . <\s>
<s> The philosopher thought about it . <\s>
<s> The mathematician solved the open problem . <\s>""".split()
# build a list of tuples. Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# fix the order of vocabulary
# if sorted() is removed, the prediction result becomes unstable.
vocab = sorted(list(set(test_sentence)))
word_to_ix = {word: i for i, word in enumerate(vocab)}
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = functional.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = functional.log_softmax(out, dim=1)
        return log_probs
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.01)
# train the model
for epoch in range(20):
    for context, target in trigrams:
        context_indices = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_indices))
        model.zero_grad()
        log_probs = model(context_var)
        loss = loss_function(log_probs, autograd.Variable(torch.LongTensor([word_to_ix[target]])))
        loss.backward()
        optimizer.step()
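# after training, score the candidate words that could follow the context ("<s>", "The")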
context_tuple = ("<s>", "The")
context_indices = [word_to_ix[w] for w in context_tuple]
context_var = autograd.Variable(torch.LongTensor(context_indices))
model.zero_grad()
log_probs = model(context_var)
sims = []
probs = []
candidates = ["philosopher", "physicist"]
# determine which candidate is closer to "mathematician" according to cosine similarity
related_embedding = model.embeddings(autograd.Variable(torch.LongTensor([word_to_ix["mathematician"]])))
for word in candidates:
    # Probability
    probs.append(log_probs[0][word_to_ix[word]])
    # Cosine similarity
    embedding = model.embeddings(autograd.Variable(torch.LongTensor([word_to_ix[word]])))
    sims.append(functional.cosine_similarity(embedding, related_embedding))
print("Predicted word (probability): %s" % (candidates[0] if probs[0] > probs[1] else candidates[1]))
print("Predicted word (cosine similarity): %s" % (candidates[0] if sims[0] > sims[1] else candidates[1]))
If the sorted() call on the vocab variable is removed, the results will differ. Since I have already fixed PyTorch's random seed, why are the results not reproducible?
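To make it easier to see what I mean by "unstable", here is a minimal standalone sketch (the file name check_vocab_order.py is just for illustration). It does not touch the model at all; it only rebuilds the word-to-index mapping with and without sorted() and prints both. On my machine the unsorted mapping can change when the script is run several times as separate processes, while the sorted one stays the same.

# check_vocab_order.py -- run this file a few times and compare the printed mappings
test_sentence = r"""<s> The mathematician ran . <\s>
<s> The mathematician ran to the store . <\s>
<s> The physicist ran to the store . <\s>
<s> The philosopher thought about it . <\s>
<s> The mathematician solved the open problem . <\s>""".split()

sorted_vocab = sorted(set(test_sentence))
unsorted_vocab = list(set(test_sentence))

# the sorted mapping is identical on every run;
# the unsorted one may assign different indices from run to run
print({word: i for i, word in enumerate(sorted_vocab)})
print({word: i for i, word in enumerate(unsorted_vocab)})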