我正在将Keras代码转换为PyTorch,因为我比前者更熟悉后者。但是,我发现它不是在学习(或几乎没有)。
下面,我提供了几乎所有的PyTorch代码,包括初始化代码,以便您可以自己尝试。您唯一需要提供的就是单词嵌入(我敢肯定您可以在网上找到许多word2vec模型)。第一个输入文件应该是带有标记化文本的文件,第二个输入文件应该是带有浮点数的文件,每行一个。因为我已经提供了所有代码,所以这个问题似乎太大而又太广泛了。但是,我认为我的问题足够具体:我的模型或训练循环中有什么问题导致我的模型无法改善或勉强改善。 (有关结果,请参见下文。)
我尝试提供许多注释(如果适用),并且我还提供了形状转换,因此您没有运行代码来查看发生了什么。数据准备方法并不重要。
最重要的部分是RegressorNet
的前进方法和RegressionNN
的训练循环(不可否认,这些名称选择不当)。我认为错误在某处。
from pathlib import Path
import time
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import gensim
from scipy.stats import pearsonr
from LazyTextDataset import LazyTextDataset
class RegressorNet(nn.Module):
def __init__(self, hidden_dim, embeddings=None, drop_prob=0.0):
super(RegressorNet, self).__init__()
self.hidden_dim = hidden_dim
self.drop_prob = drop_prob
# Load pretrained w2v model, but freeze it: don't retrain it.
self.word_embeddings = nn.Embedding.from_pretrained(embeddings)
self.word_embeddings.weight.requires_grad = False
self.w2v_rnode = nn.GRU(embeddings.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
self.dropout = nn.Dropout(drop_prob)
self.linear = nn.Linear(hidden_dim * 2, 1)
# LeakyReLU rather than ReLU so that we don't get stuck in a dead nodes
self.lrelu = nn.LeakyReLU()
def forward(self, batch_size, sentence_input):
# shape sizes for:
# * batch_size 128
# * embeddings of dim 146
# * hidden dim of 200
# * sentence length of 20
# sentence_input: torch.Size([128, 20])
# Get word2vec vector representation
embeds = self.word_embeddings(sentence_input)
# embeds: torch.Size([128, 20, 146])
# embeds.view(-1, batch_size, embeds.size(2)): torch.Size([20, 128, 146])
# Input vectors into GRU, only keep track of output
w2v_out, _ = self.w2v_rnode(embeds.view(-1, batch_size, embeds.size(2)))
# w2v_out = torch.Size([20, 128, 400])
# Leaky ReLU it
w2v_out = self.lrelu(w2v_out)
# Dropout some nodes
if self.drop_prob > 0:
w2v_out = self.dropout(w2v_out)
# w2v_out: torch.Size([20, 128, 400
# w2v_out[-1, :, :]: torch.Size([128, 400])
# Only use the last output of a sequence! Supposedly that cell outputs the final information
regression = self.linear(w2v_out[-1, :, :])
regression: torch.Size([128, 1])
return regression
class RegressionRNN:
def __init__(self, train_files=None, test_files=None, dev_files=None):
print('Using torch ' + torch.__version__)
self.datasets, self.dataloaders = RegressionRNN._set_data_loaders(train_files, test_files, dev_files)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.model = self.w2v_vocab = self.criterion = self.optimizer = self.scheduler = None
@staticmethod
def _set_data_loaders(train_files, test_files, dev_files):
# labels must be the last input file
datasets = {
'train': LazyTextDataset(train_files) if train_files is not None else None,
'test': LazyTextDataset(test_files) if test_files is not None else None,
'valid': LazyTextDataset(dev_files) if dev_files is not None else None
}
dataloaders = {
'train': DataLoader(datasets['train'], batch_size=128, shuffle=True, num_workers=4) if train_files is not None else None,
'test': DataLoader(datasets['test'], batch_size=128, num_workers=4) if test_files is not None else None,
'valid': DataLoader(datasets['valid'], batch_size=128, num_workers=4) if dev_files is not None else None
}
return datasets, dataloaders
@staticmethod
def prepare_lines(data, split_on=None, cast_to=None, min_size=None, pad_str=None, max_size=None, to_numpy=False,
list_internal=False):
""" Converts the string input (line) to an applicable format. """
out = []
for line in data:
line = line.strip()
if split_on:
line = line.split(split_on)
line = list(filter(None, line))
else:
line = [line]
if cast_to is not None:
line = [cast_to(l) for l in line]
if min_size is not None and len(line) < min_size:
# pad line up to a number of tokens
line += (min_size - len(line)) * ['@pad@']
elif max_size and len(line) > max_size:
line = line[:max_size]
if list_internal:
line = [[item] for item in line]
if to_numpy:
line = np.array(line)
out.append(line)
if to_numpy:
out = np.array(out)
return out
def prepare_w2v(self, data):
idxs = []
for seq in data:
tok_idxs = []
for word in seq:
# For every word, get its index in the w2v model.
# If it doesn't exist, use @unk@ (available in the model).
try:
tok_idxs.append(self.w2v_vocab[word].index)
except KeyError:
tok_idxs.append(self.w2v_vocab['@unk@'].index)
idxs.append(tok_idxs)
idxs = torch.tensor(idxs, dtype=torch.long)
return idxs
def train(self, epochs=10):
valid_loss_min = np.Inf
train_losses, valid_losses = [], []
for epoch in range(1, epochs + 1):
epoch_start = time.time()
train_loss, train_results = self._train_valid('train')
valid_loss, valid_results = self._train_valid('valid')
# Calculate Pearson correlation between prediction and target
try:
train_pearson = pearsonr(train_results['predictions'], train_results['targets'])
except FloatingPointError:
train_pearson = "Could not calculate Pearsonr"
try:
valid_pearson = pearsonr(valid_results['predictions'], valid_results['targets'])
except FloatingPointError:
valid_pearson = "Could not calculate Pearsonr"
# calculate average losses
train_loss = np.mean(train_loss)
valid_loss = np.mean(valid_loss)
train_losses.append(train_loss)
valid_losses.append(valid_loss)
# print training/validation statistics
print(f'----------\n'
f'Epoch {epoch} - completed in {(time.time() - epoch_start):.0f} seconds\n'
f'Training Loss: {train_loss:.6f}\t Pearson: {train_pearson}\n'
f'Validation loss: {valid_loss:.6f}\t Pearson: {valid_pearson}')
# validation loss has decreased
if valid_loss <= valid_loss_min and train_loss > valid_loss:
print(f'!! Validation loss decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model ...')
valid_loss_min = valid_loss
if train_loss <= valid_loss:
print('!! Training loss is lte validation loss. Might be overfitting!')
# Optimise with scheduler
if self.scheduler is not None:
self.scheduler.step(valid_loss)
print('Done training...')
def _train_valid(self, do):
""" Do training or validating. """
if do not in ('train', 'valid'):
raise ValueError("Use 'train' or 'valid' for 'do'.")
results = {'predictions': np.array([]), 'targets': np.array([])}
losses = np.array([])
self.model = self.model.to(self.device)
if do == 'train':
self.model.train()
torch.set_grad_enabled(True)
else:
self.model.eval()
torch.set_grad_enabled(False)
for batch_idx, data in enumerate(self.dataloaders[do], 1):
# 1. Data prep
sentence = data[0]
target = data[-1]
curr_batch_size = target.size(0)
# Returns list of tokens, possibly padded @pad@
sentence = self.prepare_lines(sentence, split_on=' ', min_size=20, max_size=20)
# Converts tokens into w2v IDs as a Tensor
sent_w2v_idxs = self.prepare_w2v(sentence)
# Converts output to Tensor of floats
target = torch.Tensor(self.prepare_lines(target, cast_to=float))
# Move input to device
sent_w2v_idxs, target = sent_w2v_idxs.to(self.device), target.to(self.device)
# 2. Predictions
pred = self.model(curr_batch_size, sentence_input=sent_w2v_idxs)
loss = self.criterion(pred, target)
# 3. Optimise during training
if do == 'train':
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
# 4. Save results
pred = pred.detach().cpu().numpy()
target = target.cpu().numpy()
results['predictions'] = np.append(results['predictions'], pred, axis=None)
results['targets'] = np.append(results['targets'], target, axis=None)
losses = np.append(losses, float(loss))
torch.set_grad_enabled(True)
return losses, results
if __name__ == '__main__':
HIDDEN_DIM = 200
# Load embeddings from pretrained gensim model
embed_p = Path('path-to.w2v_model').resolve()
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(str(embed_p))
# add a padding token with only zeros
w2v_model.add(['@pad@'], [np.zeros(w2v_model.vectors.shape[1])])
embed_weights = torch.FloatTensor(w2v_model.vectors)
# Text files are used as input. Every line is one datapoint.
# *.tok.low.*: tokenized (space-separated) sentences
# *.cross: one floating point number per line, which we are trying to predict
regr = RegressionRNN(train_files=(r'train.tok.low.en',
r'train.cross'),
dev_files=(r'dev.tok.low.en',
r'dev.cross'),
test_files=(r'test.tok.low.en',
r'test.cross'))
regr.w2v_vocab = w2v_model.vocab
regr.model = RegressorNet(HIDDEN_DIM, embed_weights, drop_prob=0.2)
regr.criterion = nn.MSELoss()
regr.optimizer = optim.Adam(list(regr.model.parameters())[0:], lr=0.001)
regr.scheduler = optim.lr_scheduler.ReduceLROnPlateau(regr.optimizer, 'min', factor=0.1, patience=5, verbose=True)
regr.train(epochs=100)
对于LazyTextDataset,您可以参考下面的类。
from torch.utils.data import Dataset
import linecache
class LazyTextDataset(Dataset):
def __init__(self, paths):
# labels are in the last path
self.paths, self.labels_path = paths[:-1], paths[-1]
with open(self.labels_path, encoding='utf-8') as fhin:
lines = 0
for line in fhin:
if line.strip() != '':
lines += 1
self.num_entries = lines
def __getitem__(self, idx):
data = [linecache.getline(p, idx + 1) for p in self.paths]
label = linecache.getline(self.labels_path, idx + 1)
return (*data, label)
def __len__(self):
return self.num_entries
正如我之前写的,我正在尝试将Keras模型转换为PyTorch。原始的Keras代码不使用嵌入层,而是将每个句子的预构建word2vec向量用作输入。在下面的模型中,没有嵌入层。 Keras摘要看起来像这样(我无权访问基本模型设置)。
Layer (type) Output Shape Param # Connected to
====================================================================================================
bidirectional_1 (Bidirectional) (200, 400) 417600
____________________________________________________________________________________________________
dropout_1 (Dropout) (200, 800) 0 merge_1[0][0]
____________________________________________________________________________________________________
dense_1 (Dense) (200, 1) 801 dropout_1[0][0]
====================================================================================================
问题在于,在输入相同的情况下,Keras模型起作用,并且在预测标签和实际标签之间获得+0.5的Pearson相关性。但是,上面的PyTorch模型似乎根本不起作用。为了让您有个想法,这是第一个时期之后的损耗(均方误差)和Pearson(相关系数,p值):
Epoch 1 - completed in 11 seconds
Training Loss: 1.684495 Pearson: (-0.0006077809280690612, 0.8173368901481127)
Validation loss: 1.708228 Pearson: (0.017794288315261794, 0.4264098054188664)
在第100个时代之后:
Epoch 100 - completed in 11 seconds
Training Loss: 1.660194 Pearson: (0.0020315421756790806, 0.4400929436716754)
Validation loss: 1.704910 Pearson: (-0.017288118524826892, 0.4396865964324158)
损耗在下面绘制(当您查看Y轴时,可以看到最小的改进)。
最后一个指示可能有问题的指标是,对于我的140K输入行,在我的GTX 1080TI上,每个纪元只用了10秒。我觉得他的用处不大,我猜想优化工作或正在运行。我不知道为什么。要发布的内容可能在我的火车循环中或模型本身中,但我找不到它。
同样,某些事情一定会出错,因为: -Keras模型做得好; -140K句子的训练速度“太快” -训练后几乎没有改善。
我想念什么?该问题很可能出现在培训循环或网络结构中。
答案 0 :(得分:5)
TL; DR :交换轴时,使用permute
代替view
,请查看答案的结尾以直观了解差异。
如果使用from_pretrained
,则无需冻结嵌入层。如documentation所述,它不使用渐变更新。
此部分:
self.w2v_rnode = nn.GRU(embeddings.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
,尤其是dropout
,但没有num_layers
,这是毫无意义的(因为浅层网络无法指定任何丢失)。
错误和主要问题:在您的forward
函数中,您使用的是view
而不是permute
,在这里:
w2v_out, _ = self.w2v_rnode(embeds.view(-1, batch_size, embeds.size(2)))
请参见this answer和有关每个功能的相应文档,并尝试使用此行:
w2v_out, _ = self.w2v_rnode(embeds.permute(1, 0, 2))
您可以考虑在创建batch_first=True
时使用w2v_rnode
参数,而不必以这种方式置换索引。
检查torch.nn.GRU的文档,您是在序列的最后一步之后,而不是在所有序列之后,因此应该在以下位置:
_, last_hidden = self.w2v_rnode(embeds.permute(1, 0, 2))
但是我认为这部分还可以。
没有犯罪行为,但是prepare_lines
非常难以理解,并且似乎也很难维护,更不用说发现最终的错误了(我想它就在这里)。
首先,似乎您正在手动填充。 请不要那样做,请使用torch.nn.pad_sequence处理批处理!
本质上,首先您将每个句子中的每个单词编码为指向嵌入的索引(就像您在prepare_w2v
中所做的那样),然后再使用torch.nn.pad_sequence
和torch.nn.pack_padded_sequence
或 torch.nn.pack_sequence
(如果行已按长度排序)。
这部分非常重要,看来您根本没有这样做(很可能这是实现中的第二个错误)。
PyTorch的RNN单元输入的输入不是填充张量,而是torch.nn.PackedSequence个对象。这是一种有效的对象存储索引,该索引指定每个序列的 unpapped 长度。
查看有关主题here,here以及网络上许多其他博客文章的更多信息。
批次中的第一个序列必须是最长的,而所有其他序列必须以降序长度提供。接下来是:
任何一种都可以,对于您来说似乎更直观的是您的电话。 我喜欢做的事情大致如下,希望对您有所帮助:
torch.utils.data.Dataset
对象,为每个 geem 返回单个句子,该对象以元组的形式返回,该元组由要素(torch.Tensor
)和标签(单个值)组成就像你也一样。collate_fn
以与torch.utils.data.DataLoader一起使用,该torch.nn.pack_sequence
负责在这种情况下对每个批次进行排序和填充(+它将返回要传递到神经网络中的每个句子的长度)。forward
方法中使用了collate_fn
(嵌入!)以将其推送到RNN层。谈到第三点,这里是import torch
def length_sort(features):
# Get length of each sentence in batch
sentences_lengths = torch.tensor(list(map(len, features)))
# Get indices which sort the sentences based on descending length
_, sorter = sentences_lengths.sort(descending=True)
# Pad batch as you have the lengths and sorter saved already
padded_features = torch.nn.utils.rnn.pad_sequence(features, batch_first=True)
return padded_features, sentences_lengths, sorter
def pad_collate_fn(batch):
# DataLoader return batch like that unluckily, check it on your own
features, labels = (
[element[0] for element in batch],
[element[1] for element in batch],
)
padded_features, sentences_lengths, sorter = length_sort(features)
# Sort by length features and labels accordingly
sorted_padded_features, sorted_labels = (
padded_features[sorter],
torch.tensor(labels)[sorter],
)
return sorted_padded_features, sorted_labels, sentences_lengths
的示例实现,您应该了解一下:
collate_fn
在DataLoaders
中将它们用作self.model = self.w2v_vocab = self.criterion = self.optimizer = self.scheduler = None
,您应该就可以了(也许需要进行一些小的调整,因此理解它背后的想法很重要)。
培训循环:很多小错误的好地方,您可能希望使用PyTorch Ignite来将这些错误最小化。我很难经历像Tensorflow一样的Estimator一样的API训练循环(例如padding_idx
)。请不要这样做,请将每个任务(数据创建,数据加载,数据准备,模型设置,训练循环,日志记录)分离到各自的模块中。总而言之,PyTorch / Keras比Tensorflow更具可读性和完整性。
使嵌入的第一行等于矢量包含零。:默认情况下,torch.nn.functional.embedding期望第一行用于填充。因此,您应该为每个单词的唯一索引从1 或开始,或者为不同的值指定一个自变量permute()
(尽管我极力反对这种方法,充其量是令人困惑的)。
我希望这个答案至少对您有所帮助,如果有不清楚的地方,请在下面发表评论,我将尝试从其他角度/更详细地进行解释。
此代码无法复制,也不是问题的具体内容。我们没有您正在使用的数据,也没有您的单词向量,随机种子不固定等。
PS。最后一件事:检查数据的非常小的子集(例如96个示例)的性能,如果不收敛,很可能您的代码中确实存在错误。
关于时间:关于正确和有效的实现,它们可能不合时宜(由于我想不进行排序和填充),通常Keras和PyTorch的时间非常相似(如果我理解问题的这一部分是预期的)。 / p>
这个简单的示例显示了view()
和import torch
a = torch.tensor([[1, 2], [3, 4], [5, 6]])
print(a)
print(a.permute(1, 0))
print(a.view(2, 3))
之间的区别。第一个交换轴,而第二个交换轴不更改内存布局,仅将数组分块为所需的形状(如果可能)。
tensor([[1, 2],
[3, 4],
[5, 6]])
tensor([[1, 3, 5],
[2, 4, 6]])
tensor([[1, 2, 3],
[4, 5, 6]])
输出将是:
reshape
view
几乎类似于numpy
,是为来自view
的人添加的,因此对他们来说更容易,更自然,但有一个重要区别:
reshape
从不复制数据,并且仅在连续的内存上工作(因此,在排列之后,如上面的数据可能不会连续,因此访问起来可能会更慢)