I am working on neural machine translation (French to English). My first model, trained on a Quadro P5000 GPU (16278 MiB), is a plain sequence-to-sequence design built with LSTMs.
# Model definition
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# FR, EN, fr_embeddings, en_embeddings, DEVICE and torch_utils come from my
# preprocessing / utility code, which I omit here.


class EncoderLSTM(nn.Module):
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout
    ):
        super(EncoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            bidirectional=True,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))

    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_sequences, sequence_lengths):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,]
        :return
            outputs: Tensor[seq_len, batch_size, 2 * hidden_size]
            hn: Tensor[n_layers * 2, batch_size, hidden_size]
            cn: Tensor[n_layers * 2, batch_size, hidden_size]
        """
        embedded = self.embedding(input_sequences)
        embedded = F.dropout(embedded, p=self.dropout, training=self.training)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)
        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        return outputs, hn, cn
class DecoderLSTM(nn.Module):
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout
    ):
        super(DecoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))
        self.fc = nn.Linear(hidden_size, vocab_size)

    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_word_index, h_state, c_state):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
        :return
            logit: Tensor[batch_size, vocab_size]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
        """
        embedded = self.embedding(input_word_index.unsqueeze(0))
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        logit = self.fc(F.dropout(outputs, p=self.dropout, training=self.training))
        logit = logit.squeeze(0)
        return logit, h_state, c_state
class SeqToSeqLSTM(nn.Module):
    def __init__(self, encoder, decoder, device):
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and Decoder must have the same number of recurrent layers'
        assert encoder.hidden_size == decoder.hidden_size, \
            'Encoder and Decoder must have the same number of recurrent hidden units'
        super(SeqToSeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.init_h0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.init_c0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.device = device

    def forward(self, src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            src_lengths: Tensor[batch_size,]
            dest_sequences: Tensor[seq_len, batch_size]
            dest_lengths: Tensor[batch_size,]
            tf_ratio: float
        :return
            logits: Tensor[max(decode_lengths), batch_size, vocab_size]
            sorted_dest_sequences: Tensor[seq_len, batch_size]
            sorted_decode_lengths: Tensor[batch_size,]
            sorted_indices: Tensor[batch_size,]
        """
        # Encoding
        _, h_state, c_state = self.encoder(
            input_sequences=src_sequences,
            sequence_lengths=src_lengths
        )
        # h_state: [n_layers * 2, batch_size, hidden_size]
        # c_state: [n_layers * 2, batch_size, hidden_size]
        # Sort the batch (dest) by decreasing lengths
        sorted_dest_lengths, sorted_indices = torch.sort(dest_lengths, dim=0, descending=True)
        sorted_dest_sequences = dest_sequences[:, sorted_indices]
        h_state = h_state[:, sorted_indices, :]
        c_state = c_state[:, sorted_indices, :]
        # Init hidden and memory states
        h_state = self.init_h0(h_state.permute(1, 2, 0))  # [batch_size, hidden_size, n_layers]
        c_state = self.init_c0(c_state.permute(1, 2, 0))  # [batch_size, hidden_size, n_layers]
        h_state = h_state.permute(2, 0, 1)  # [n_layers, batch_size, hidden_size]
        c_state = c_state.permute(2, 0, 1)  # [n_layers, batch_size, hidden_size]
        # We won't decode at the <eos> position, since we've finished generating as soon as we generate <eos>
        # So, decoding lengths are actual lengths - 1
        sorted_decode_lengths = (sorted_dest_lengths - 1).tolist()
        # Decoding
        batch_size, last = dest_sequences.size(1), None
        logits = torch.zeros(max(sorted_decode_lengths), batch_size, self.decoder.vocab_size).to(self.device)
        for t in range(max(sorted_decode_lengths)):
            batch_size_t = sum([l > t for l in sorted_decode_lengths])
            if last is not None:
                if random.random() < tf_ratio:
                    in_ = last[:batch_size_t]
                else:
                    in_ = sorted_dest_sequences[t, :batch_size_t]
            else:
                in_ = sorted_dest_sequences[t, :batch_size_t]
            # in_: [batch_size_t,]
            logit, h_state, c_state = self.decoder(
                in_,
                h_state[:, :batch_size_t, :].contiguous(),
                c_state[:, :batch_size_t, :].contiguous()
            )
            # logit: [batch_size_t, vocab_size]
            # h_state: [num_layers, batch_size_t, hidden_size]
            # c_state: [num_layers, batch_size_t, hidden_size]
            logits[t, :batch_size_t, :] = logit
            last = torch.argmax(F.softmax(logit, dim=1), dim=1)  # [batch_size_t,]
        return logits, sorted_dest_sequences, sorted_decode_lengths, sorted_indices
# Model initialization
MODEL_NAME = 'seq2seq-lstm'
N_LAYERS = 4
HIDDEN_SIZE = 512
EMBEDDING_SIZE = 300
ENC_DROPOUT = 0.3
ENC_RECURRENT_DROPOUT = 0.25
DEC_DROPOUT = 0.15
DEC_RECURRENT_DROPOUT = 0.2
N_EPOCHS = 15
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
GRAD_CLIP = 1.0
TF_RATIO = 1.0

encoder = EncoderLSTM(
    embedding_size=EMBEDDING_SIZE,
    vocab_size=len(FR.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
    recurrent_dropout=ENC_RECURRENT_DROPOUT
)
encoder.load_pretrained_embeddings(fr_embeddings)
encoder.fine_tuning_embeddings(fine_tune=True)

decoder = DecoderLSTM(
    embedding_size=EMBEDDING_SIZE,
    vocab_size=len(EN.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
    recurrent_dropout=DEC_RECURRENT_DROPOUT
)
decoder.load_pretrained_embeddings(en_embeddings)
decoder.fine_tuning_embeddings(fine_tune=True)

seq2seq = SeqToSeqLSTM(encoder=encoder, decoder=decoder, device=DEVICE)
seq2seq.apply(torch_utils.xavier_init_weights)
seq2seq.to(DEVICE)

optimizer = optim.RMSprop(params=seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

print(f'Number of parameters of the model: {torch_utils.count_parameters(seq2seq):,}')
# Number of parameters of the model: 41,471,097
With the model above, one training epoch takes 4:31 (about four and a half minutes). When I add a Luong-style attention mechanism, one epoch takes 1:55:47 (almost two hours), and I would like to understand why.
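For reference, this is roughly how the epoch time can be measured; it is only a sketch, not my exact training loop: `train_iterator` and the `batch.src` / `batch.dest` field names are placeholders for my torchtext iterator, and the packing of logits/targets before the loss is just one plausible way to handle the variable decode lengths.

# Rough per-epoch timing (sketch). torch.cuda.synchronize() makes sure all
# queued CUDA work is included in the measurement, not just kernel launches.
import time

def time_one_epoch(model, iterator, optimizer, criterion, tf_ratio):
    model.train()
    torch.cuda.synchronize()
    start = time.time()
    for batch in iterator:
        src, src_lengths = batch.src      # placeholder field names
        dest, dest_lengths = batch.dest   # placeholder field names
        optimizer.zero_grad()
        logits, sorted_dest, decode_lengths, _ = model(
            src, src_lengths, dest, dest_lengths, tf_ratio
        )
        # Targets are the next tokens; pack away the padded positions before the loss
        targets = sorted_dest[1:, :]
        logits = nn.utils.rnn.pack_padded_sequence(logits, decode_lengths).data
        targets = nn.utils.rnn.pack_padded_sequence(targets, decode_lengths).data
        loss = criterion(logits, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
    torch.cuda.synchronize()
    return time.time() - start

Running this for both models on the same iterator should reproduce the gap I am seeing.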
# Model definition (with Luong attention)
import numpy as np  # used below for the sqrt scaling of the attention scores
# EncoderLSTM is identical to the one defined in the first model above.
class LuongAttention(nn.Module):
    def __init__(self, hidden_size, method):
        if method not in ['dot', 'concat']:
            raise NotImplementedError(f'The {method} attention is not defined!')
        super(LuongAttention, self).__init__()
        self.hidden_size = hidden_size
        self.method = method
        if method == 'dot':
            pass
        elif method == 'concat':
            self.W = nn.Linear(hidden_size, hidden_size)
            self.V = nn.Linear(hidden_size, 1)
        else:
            raise NotImplementedError(f'{method} not implemented!')

    def forward(self, h_state, enc_outputs, mask):
        """
        :args
            h_state: Tensor[n_layers, batch_size, hidden_size]
            enc_outputs: Tensor[seq_len, batch_size, hidden_size]
            mask: Tensor[seq_len, batch_size]
        :return
            attn_weights: Tensor[seq_len, batch_size, 1]
        """
        if h_state.shape[0] > 1:
            h_state = h_state.sum(dim=0)    # [batch_size, hidden_size]
            h_state = h_state.unsqueeze(0)  # [1, batch_size, hidden_size]
        # Calculating the alignment scores
        if self.method == 'dot':
            scores = torch.sum(h_state * enc_outputs, dim=2)
            scores = scores.unsqueeze(dim=2) / np.sqrt(self.hidden_size)  # [seq_len, batch_size, 1]
        elif self.method == 'concat':
            scores = self.V(
                torch.tanh(self.W(
                    enc_outputs + h_state  # [seq_len, batch_size, hidden_size]
                ))
            )  # [seq_len, batch_size, 1]
        else:
            raise NotImplementedError(f'{self.method} not implemented!')
        # Apply mask to ignore <pad> tokens
        mask = mask.unsqueeze(2)  # [seq_len, batch_size, 1]
        scores = scores.masked_fill(mask == 0, -1e10)
        # Calculating the attention weights by softmaxing the alignment scores
        attn_weights = F.softmax(scores, dim=1)  # [seq_len, batch_size, 1]
        return attn_weights
class DecoderLSTM(nn.Module):
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout,
        attention
    ):
        super(DecoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.attention = attention
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)

    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune

    def forward(self, input_word_index, h_state, c_state, enc_outputs, mask):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
            enc_outputs: Tensor[seq_len, batch_size, hidden_size]
            mask: Tensor[seq_len, batch_size]
        :return
            logit: Tensor[batch_size, vocab_size]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
            attn_weights: Tensor[batch_size, seq_len]
        """
        embedded = self.embedding(input_word_index.unsqueeze(0))  # [seq_len=1, batch_size, embedding_size]
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        # outputs: [seq_len=1, batch_size, hidden_size]
        # h_state: [n_layers, batch_size, hidden_size]
        # c_state: [n_layers, batch_size, hidden_size]
        # Compute Attention Weights
        attn_weights = self.attention(h_state=outputs,
                                      enc_outputs=enc_outputs,
                                      mask=mask)  # [seq_len, batch_size, 1]
        # Compute Context Vector
        context_vector = torch.bmm(
            enc_outputs.permute(1, 2, 0),   # [batch_size, hidden_size, seq_len]
            attn_weights.permute(1, 0, 2),  # [batch_size, seq_len, 1]
        ).permute(2, 0, 1)                  # [1, batch_size, hidden_size]
        # New input: concatenate context_vector with hidden states
        new_input = torch.cat((context_vector, outputs), dim=2)  # [1, batch_size, hidden_size * 2]
        # Get logit
        out = torch.tanh(self.fc1(new_input.squeeze(0)))
        logit = self.fc2(out)  # [batch_size, vocab_size]
        return logit, h_state, c_state, attn_weights.squeeze(2)
class SeqToSeqLSTM(nn.Module):
    def __init__(self, encoder, decoder, pad_index, device):
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and Decoder must have the same number of recurrent layers'
        assert encoder.hidden_size == decoder.hidden_size, \
            'Encoder and Decoder must have the same number of recurrent hidden units'
        super(SeqToSeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_index = pad_index
        self.init_h0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.init_c0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.fc = nn.Linear(2 * encoder.hidden_size, encoder.hidden_size)
        self.device = device

    def create_mask(self, src_sequences):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
        :return
            mask: Tensor[seq_len, batch_size]
        """
        mask = (src_sequences != self.pad_index)
        return mask

    def forward(self, src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            src_lengths: Tensor[batch_size,]
            dest_sequences: Tensor[seq_len, batch_size]
            dest_lengths: Tensor[batch_size,]
            tf_ratio: float
        :return
            logits: Tensor[max(decode_lengths), batch_size, vocab_size]
            sorted_dest_sequences: Tensor[seq_len, batch_size]
            sorted_decode_lengths: Tensor[batch_size,]
            sorted_indices: Tensor[batch_size,]
        """
        mask = self.create_mask(src_sequences)  # [seq_len, batch_size]
        # Encoding
        enc_outputs, h_state, c_state = self.encoder(
            input_sequences=src_sequences,
            sequence_lengths=src_lengths
        )
        # enc_outputs: [seq_len, batch_size, 2 * hidden_size]
        # h_state: [n_layers * 2, batch_size, hidden_size]
        # c_state: [n_layers * 2, batch_size, hidden_size]
        enc_outputs = self.fc(enc_outputs)
        # enc_outputs: [seq_len, batch_size, hidden_size]
        # Sort the batch (dest) by decreasing lengths
        sorted_dest_lengths, sorted_indices = torch.sort(dest_lengths, dim=0, descending=True)
        sorted_dest_sequences = dest_sequences[:, sorted_indices]
        enc_outputs = enc_outputs[:, sorted_indices, :]
        h_state = h_state[:, sorted_indices, :]
        c_state = c_state[:, sorted_indices, :]
        mask = mask[:, sorted_indices]  # keep the mask aligned with the re-ordered batch
        # Init hidden and memory states
        h_state = self.init_h0(h_state.permute(1, 2, 0))  # [batch_size, hidden_size, n_layers]
        c_state = self.init_c0(c_state.permute(1, 2, 0))  # [batch_size, hidden_size, n_layers]
        h_state = h_state.permute(2, 0, 1)  # [n_layers, batch_size, hidden_size]
        c_state = c_state.permute(2, 0, 1)  # [n_layers, batch_size, hidden_size]
        # We won't decode at the <eos> position, since we've finished generating as soon as we generate <eos>
        # So, decoding lengths are actual lengths - 1
        sorted_decode_lengths = (sorted_dest_lengths - 1).tolist()
        # Decoding
        batch_size, last = dest_sequences.size(1), None
        logits = torch.zeros(max(sorted_decode_lengths), batch_size, self.decoder.vocab_size).to(self.device)
        for t in range(max(sorted_decode_lengths)):
            batch_size_t = sum([l > t for l in sorted_decode_lengths])
            if last is not None:
                if random.random() < tf_ratio:
                    in_ = last[:batch_size_t]
                else:
                    in_ = sorted_dest_sequences[t, :batch_size_t]
            else:
                in_ = sorted_dest_sequences[t, :batch_size_t]
            # in_: [batch_size_t,]
            logit, h_state, c_state, _ = self.decoder(
                in_,
                h_state[:, :batch_size_t, :].contiguous(),
                c_state[:, :batch_size_t, :].contiguous(),
                enc_outputs[:, :batch_size_t, :],
                mask[:, :batch_size_t]
            )
            # logit: [batch_size_t, vocab_size]
            # h_state: [num_layers, batch_size_t, hidden_size]
            # c_state: [num_layers, batch_size_t, hidden_size]
            logits[t, :batch_size_t, :] = logit
            last = torch.argmax(F.softmax(logit, dim=1), dim=1)  # [batch_size_t,]
        return logits, sorted_dest_sequences, sorted_decode_lengths, sorted_indices
# Model initialization
encoder = EncoderLSTM(embedding_size=EMBEDDING_SIZE,
                      vocab_size=len(FR.vocab),
                      hidden_size=HIDDEN_SIZE,
                      n_layers=N_LAYERS,
                      dropout=ENC_DROPOUT,
                      recurrent_dropout=ENC_RECURRENT_DROPOUT)
encoder.load_pretrained_embeddings(fr_embeddings)
encoder.fine_tuning_embeddings(fine_tune=True)

attention = LuongAttention(hidden_size=HIDDEN_SIZE, method='dot')

decoder = DecoderLSTM(embedding_size=EMBEDDING_SIZE,
                      vocab_size=len(EN.vocab),
                      hidden_size=HIDDEN_SIZE,
                      n_layers=N_LAYERS,
                      dropout=DEC_DROPOUT,
                      recurrent_dropout=DEC_RECURRENT_DROPOUT,
                      attention=attention)
decoder.load_pretrained_embeddings(en_embeddings)
decoder.fine_tuning_embeddings(fine_tune=True)

seq2seq = SeqToSeqLSTM(encoder=encoder,
                       decoder=decoder,
                       pad_index=EN.vocab.stoi[EN.pad_token],
                       device=DEVICE)
seq2seq.apply(torch_utils.xavier_init_weights)
seq2seq.to(DEVICE)

optimizer = optim.RMSprop(params=seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

print(f'Number of parameters of the model: {torch_utils.count_parameters(seq2seq):,}')
# Number of parameters of the model: 42,520,697
The two models have almost the same number of parameters. The data come from the Europarl parallel corpus: 89,752 examples, each between 15 and 25 tokens long. With a batch size of 64 that is roughly 1,400 batches per epoch, i.e. about 0.19 s per batch without attention versus about 5 s per batch with it, a ~25x slowdown. Does anyone know why the model with attention takes so much longer to train?
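To help narrow this down, here is a minimal profiling sketch (not part of my training code) that could be run on a single batch of the attention model to see which operations dominate the step time; `batch.src` / `batch.dest` are again placeholder field names from my iterator:

# Profile one forward/backward pass of the attention model (sketch).
import torch.autograd.profiler as profiler

src, src_lengths = batch.src      # placeholder: one batch from the training iterator
dest, dest_lengths = batch.dest

with profiler.profile(use_cuda=True) as prof:
    logits, _, _, _ = seq2seq(src, src_lengths, dest, dest_lengths, TF_RATIO)
    logits.sum().backward()  # dummy scalar loss, just enough to time the backward pass

# Operators sorted by total CUDA time; the per-timestep attention ops should show up here
print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=20))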