我尝试仅使用GRU和线性层(完整的代码也位于https://www.kaggle.com/alvations/gru-language-model-not-training-properly)来重新实现简单的GRU语言模型:
class Generator(nn.Module):
def __init__(self, vocab_size, embedding_size, hidden_size, num_layers):
super(Generator, self).__init__()
# Initialize the embedding layer with the
# - size of input (i.e. no. of words in input vocab)
# - no. of hidden nodes in the embedding layer
self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
# Initialize the GRU with the
# - size of the input (i.e. embedding layer)
# - size of the hidden layer
self.gru = nn.GRU(embedding_size, hidden_size, num_layers)
# Initialize the "classifier" layer to map the RNN outputs
# to the vocabulary. Remember we need to -1 because the
# vectorized sentence we left out one token for both x and y:
# - size of hidden_size of the GRU output.
# - size of vocabulary
self.classifier = nn.Linear(hidden_size, vocab_size)
def forward(self, inputs, use_softmax=False, hidden=None):
# Look up for the embeddings for the input word indices.
embedded = self.embedding(inputs)
# Put the embedded inputs into the GRU.
output, hidden = self.gru(embedded, hidden)
# Matrix manipulation magic.
batch_size, sequence_len, hidden_size = output.shape
# Technically, linear layer takes a 2-D matrix as input, so more manipulation...
output = output.contiguous().view(batch_size * sequence_len, hidden_size)
# Put it through the classifier
# And reshape it to [batch_size x sequence_len x vocab_size]
output = self.classifier(output).view(batch_size, sequence_len, -1)
return (F.softmax(output,dim=2), hidden) if use_softmax else (output, hidden)
def generate(self, max_len, temperature=1.0):
pass
以及训练程序:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Set the hidden_size of the GRU
embed_size = 100
hidden_size = 100
num_layers = 1
# Setup the data.
batch_size=50
kilgariff_data = KilgariffDataset(tokenized_text)
dataloader = DataLoader(dataset=kilgariff_data, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss(ignore_index=kilgariff_data.vocab.token2id['<pad>'], size_average=True)
model = Generator(len(kilgariff_data.vocab), embed_size, hidden_size, num_layers).to(device)
learning_rate = 0.003
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#model = nn.DataParallel(model)
losses = []
def train(num_epochs, dataloader, model, criterion, optimizer):
plt.ion()
for _e in range(num_epochs):
for batch in tqdm(dataloader):
x = batch['x'].to(device)
x_len = batch['x_len'].to(device)
y = batch['y'].to(device)
# Zero gradient.
optimizer.zero_grad()
# Feed forward.
output, hidden = model(x, use_softmax=True)
# Compute loss:
# Shape of the `output` is [batch_size x sequence_len x vocab_size]
# Shape of `y` is [batch_size x sequence_len]
# CrossEntropyLoss expects `output` to be [batch_size x vocab_size x sequence_len]
_, prediction = torch.max(output, dim=2)
loss = criterion(output.permute(0, 2, 1), y)
loss.backward()
optimizer.step()
losses.append(loss.float().data)
clear_output(wait=True)
plt.plot(losses)
plt.pause(0.05)
train(50, dataloader, model, criterion, optimizer)
#learning_rate = 0.05
#optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#train(4, dataloader, model, criterion, optimizer)
但是当模型进行预测时,我们看到它只是在预测“ the”和逗号“,”。
有人发现我的代码有问题吗?还是超参数?
完整代码:
# coding: utf-8
# In[1]:
# IPython candies...
from IPython.display import Image
from IPython.core.display import HTML
from IPython.display import clear_output
# In[2]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from gensim.corpora import Dictionary
import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# In[3]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
sns.set(rc={'figure.figsize':(12, 8)})
torch.manual_seed(42)
# In[4]:
try: # Use the default NLTK tokenizer.
from nltk import word_tokenize, sent_tokenize
# Testing whether it works.
# Sometimes it doesn't work on some machines because of setup issues.
word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
except: # Use a naive sentence tokenizer and toktok.
import re
from nltk.tokenize import ToktokTokenizer
# See https://stackoverflow.com/a/25736515/610569
sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
# Use the toktok tokenizer that requires no dependencies.
toktok = ToktokTokenizer()
word_tokenize = word_tokenize = toktok.tokenize
# In[5]:
import os
import requests
import io #codecs
# Text version of https://kilgarriff.co.uk/Publications/2005-K-lineer.pdf
if os.path.isfile('language-never-random.txt'):
with io.open('language-never-random.txt', encoding='utf8') as fin:
text = fin.read()
else:
url = "https://gist.githubusercontent.com/alvations/53b01e4076573fea47c6057120bb017a/raw/b01ff96a5f76848450e648f35da6497ca9454e4a/language-never-random.txt"
text = requests.get(url).content.decode('utf8')
with io.open('language-never-random.txt', 'w', encoding='utf8') as fout:
fout.write(text)
# In[6]:
# Tokenize the text.
tokenized_text = [list(map(str.lower, word_tokenize(sent)))
for sent in sent_tokenize(text)]
# In[7]:
class KilgariffDataset(nn.Module):
def __init__(self, texts):
self.texts = texts
# Initialize the vocab
special_tokens = {'<pad>': 0, '<unk>':1, '<s>':2, '</s>':3}
self.vocab = Dictionary(texts)
self.vocab.patch_with_special_tokens(special_tokens)
# Keep track of the vocab size.
self.vocab_size = len(self.vocab)
# Keep track of how many data points.
self._len = len(texts)
# Find the longest text in the data.
self.max_len = max(len(txt) for txt in texts)
def __getitem__(self, index):
vectorized_sent = self.vectorize(self.texts[index])
x_len = len(vectorized_sent)
# To pad the sentence:
# Pad left = 0; Pad right = max_len - len of sent.
pad_dim = (0, self.max_len - len(vectorized_sent))
vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
return {'x':vectorized_sent[:-1],
'y':vectorized_sent[1:],
'x_len':x_len}
def __len__(self):
return self._len
def vectorize(self, tokens, start_idx=2, end_idx=3):
"""
:param tokens: Tokens that should be vectorized.
:type tokens: list(str)
"""
# See https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.doc2idx
# Lets just cast list of indices into torch tensors directly =)
vectorized_sent = [start_idx] + self.vocab.doc2idx(tokens) + [end_idx]
return torch.tensor(vectorized_sent)
def unvectorize(self, indices):
"""
:param indices: Converts the indices back to tokens.
:type tokens: list(int)
"""
return [self.vocab[i] for i in indices]
# In[8]:
kilgariff_data = KilgariffDataset(tokenized_text)
len(kilgariff_data.vocab)
# In[9]:
batch_size = 10
dataloader = DataLoader(dataset=kilgariff_data, batch_size=batch_size, shuffle=True)
for data_dict in dataloader:
# Sort indices of data in batch by lengths.
sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
data_batch = {name:_tensor[sorted_indices]
for name, _tensor in data_dict.items()}
print(data_batch)
break
# In[97]:
class Generator(nn.Module):
def __init__(self, vocab_size, embedding_size, hidden_size, num_layers):
super(Generator, self).__init__()
# Initialize the embedding layer with the
# - size of input (i.e. no. of words in input vocab)
# - no. of hidden nodes in the embedding layer
self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
# Initialize the GRU with the
# - size of the input (i.e. embedding layer)
# - size of the hidden layer
self.gru = nn.GRU(embedding_size, hidden_size, num_layers)
# Initialize the "classifier" layer to map the RNN outputs
# to the vocabulary. Remember we need to -1 because the
# vectorized sentence we left out one token for both x and y:
# - size of hidden_size of the GRU output.
# - size of vocabulary
self.classifier = nn.Linear(hidden_size, vocab_size)
def forward(self, inputs, use_softmax=False, hidden=None):
# Look up for the embeddings for the input word indices.
embedded = self.embedding(inputs)
# Put the embedded inputs into the GRU.
output, hidden = self.gru(embedded, hidden)
# Matrix manipulation magic.
batch_size, sequence_len, hidden_size = output.shape
# Technically, linear layer takes a 2-D matrix as input, so more manipulation...
output = output.contiguous().view(batch_size * sequence_len, hidden_size)
# Put it through the classifier
# And reshape it to [batch_size x sequence_len x vocab_size]
output = self.classifier(output).view(batch_size, sequence_len, -1)
return (F.softmax(output,dim=2), hidden) if use_softmax else (output, hidden)
def generate(self, max_len, temperature=1.0):
pass
# In[98]:
# Set the hidden_size of the GRU
embed_size = 12
hidden_size = 10
num_layers = 4
_encoder = Generator(len(kilgariff_data.vocab), embed_size, hidden_size, num_layers)
# In[99]:
# Take a batch.
_batch = next(iter(dataloader))
_inputs, _lengths = _batch['x'], _batch['x_len']
_targets = _batch['y']
max(_lengths)
# In[100]:
_output, _hidden = _encoder(_inputs)
print('Output sizes:\t', _output.shape)
print('Input sizes:\t', batch_size, kilgariff_data.max_len -1, len(kilgariff_data.vocab))
print('Target sizes:\t', _targets.shape)
# In[101]:
_, predicted_indices = torch.max(_output, dim=2)
print(predicted_indices.shape)
predicted_indices
# In[103]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Set the hidden_size of the GRU
embed_size = 100
hidden_size = 100
num_layers = 1
# Setup the data.
batch_size=50
kilgariff_data = KilgariffDataset(tokenized_text)
dataloader = DataLoader(dataset=kilgariff_data, batch_size=batch_size, shuffle=True)
criterion = nn.CrossEntropyLoss(ignore_index=kilgariff_data.vocab.token2id['<pad>'], size_average=True)
model = Generator(len(kilgariff_data.vocab), embed_size, hidden_size, num_layers).to(device)
learning_rate = 0.003
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#model = nn.DataParallel(model)
losses = []
def train(num_epochs, dataloader, model, criterion, optimizer):
plt.ion()
for _e in range(num_epochs):
for batch in tqdm(dataloader):
x = batch['x'].to(device)
x_len = batch['x_len'].to(device)
y = batch['y'].to(device)
# Zero gradient.
optimizer.zero_grad()
# Feed forward.
output, hidden = model(x, use_softmax=True)
# Compute loss:
# Shape of the `output` is [batch_size x sequence_len x vocab_size]
# Shape of `y` is [batch_size x sequence_len]
# CrossEntropyLoss expects `output` to be [batch_size x vocab_size x sequence_len]
_, prediction = torch.max(output, dim=2)
loss = criterion(output.permute(0, 2, 1), y)
loss.backward()
optimizer.step()
losses.append(loss.float().data)
clear_output(wait=True)
plt.plot(losses)
plt.pause(0.05)
train(50, dataloader, model, criterion, optimizer)
#learning_rate = 0.05
#optimizer = optim.SGD(model.parameters(), lr=learning_rate)
#train(4, dataloader, model, criterion, optimizer)
# In[ ]:
list(kilgariff_data.vocab.items())
# In[105]:
start_token = '<s>'
hidden_state = None
max_len = 20
temperature=0.8
i = 0
while start_token not in ['</s>', '<pad>'] and i < max_len:
i += 1
start_state = torch.tensor(kilgariff_data.vocab.token2id[start_token]).unsqueeze(0).unsqueeze(0).to(device)
model.embedding(start_state)
output, hidden_state = model.gru(model.embedding(start_state), hidden_state)
batch_size, sequence_len, hidden_size = output.shape
output = output.contiguous().view(batch_size * sequence_len, hidden_size)
output = model.classifier(output).view(batch_size, sequence_len, -1)
_, prediction = torch.max(F.softmax(output, dim=2), dim=2)
start_token = kilgariff_data.vocab[int(prediction.squeeze(0).squeeze(0))]
print(start_token, end=' ')
答案 0 :(得分:1)
我绝不是PyTorch专家,但是那段代码对我来说似乎很糟糕:
# Put the embedded inputs into the GRU.
output, hidden = self.gru(embedded, hidden)
# Matrix manipulation magic.
batch_size, sequence_len, hidden_size = output.shape
# Technically, linear layer takes a 2-D matrix as input, so more manipulation...
output = output.contiguous().view(batch_size * sequence_len, hidden_size)
GRU
实例化batch_first=True
,则输出形状为(seq_len, batch, num_directions * hidden_size)
,而不是翻转seq_len
和batch_size
。对于view命令,实际上在技术上并不重要,但这是我的主要问题。view(batch_size * sequence_len, hidden_size)
看起来根本不正确。假设您从一批32号开始,但是之后的大小为32*seq_len
。通常,仅使用最后一步的输出(或所有步骤的平均值或最大值)类似的事情应该起作用:
# Put the embedded inputs into the GRU.
output, hidden = self.gru(embedded, hidden)
# Not needed, just to show the true output shape order
seq_len, batch_size, hidden_size = output.shape
# Given the shape of output, this is the last step
output = output[-1]
# output.shape = (batch_size, hidden_size) <-- What you want
两个警告的个人话:
view()
是危险的命令!当张量的尺寸不匹配时,PyTorch或任何其他框架只会引发错误。但是仅仅因为尺寸适合view()
之后并不意味着重整已正确完成,即,值位于输出张量的正确位置。例如,如果您必须将形状(seq_len, batch_size, hidden_size)
展平为(batch_size, seq_len*hidden_size)
,则不能简单地进行view(batch_size, -1)
,而是首先必须进行transpose(1,0)
才能获得形状{{1 }}。在没有(batch_size, seq_len, hidden_size)
的情况下,transpose()
将起作用并且尺寸将是正确的。但只有使用view()
时,值才在transpose()
之后view()
命令是最大的陷阱。如果有帮助,这是GRU分类器网络的view()
方法:
forward
答案 1 :(得分:0)
train()
中的这一行应该是
output, hidden = model(x, use_softmax=False)
在训练时禁用use_softmax
,则模型应该正确训练,并且训练CE损失将减少到接近0。