Why does training one of my models take almost 2 hours?

Time: 2020-08-21 22:03:38

Tags: python nlp pytorch lstm

I am working on neural machine translation (French to English). I built a first model (trained on a Quadro P5000 GPU with 16278 MiB of memory): a simple LSTM-based sequence-to-sequence design.

# Model definition
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class EncoderLSTM(nn.Module):
    
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout
    ):
        super(EncoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            bidirectional=True,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))
        
    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)
        
    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune
    
    def forward(self, input_sequences, sequence_lengths):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,]
            
        :return
            outputs: Tensor[seq_len, batch_size, 2 * hidden_size]
            hn: Tensor[n_layers * 2, batch_size, hidden_size]
            cn: Tensor[n_layers * 2, batch_size, hidden_size]
        """
        embedded = self.embedding(input_sequences)
        embedded = F.dropout(embedded, p=self.dropout, training=self.training)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)
        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        return outputs, hn, cn


class DecoderLSTM(nn.Module):
    
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout
    ):
        super(DecoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))
        self.fc = nn.Linear(hidden_size, vocab_size)
        
    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)
        
    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune
        
    def forward(self, input_word_index, h_state, c_state):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
            
        :return
            logit: Tensor[batch_size, vocab_size]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
        """
        embedded = self.embedding(input_word_index.unsqueeze(0))
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        logit = self.fc(F.dropout(outputs, p=self.dropout, training=self.training))
        logit = logit.squeeze(0)
        return logit, h_state, c_state


class SeqToSeqLSTM(nn.Module):
    
    def __init__(self, encoder, decoder, device):
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and Decoder must have the same number of recurrent layers'
        assert encoder.hidden_size == decoder.hidden_size, \
            'Encoder and Decoder must have the same number of recurrent hidden units'
        
        super(SeqToSeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.init_h0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers) 
        self.init_c0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.device = device
        
    def forward(self, src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            src_lengths: Tensor[batch_size,]
            dest_sequences: Tensor[seq_len, batch_size]
            dest_lengths: Tensor[batch_size,]
            tf_ratio: float
            
        :return
            logits: Tensor[max(decode_lengths), batch_size, vocab_size]
            sorted_dest_sequences: Tensor[seq_len, batch_size]
            sorted_decode_lengths: Tensor[batch_size,]
            sorted_indices: Tensor[batch_size,]
        """
        # Encoding
        _, h_state, c_state = self.encoder(
            input_sequences=src_sequences,
            sequence_lengths=src_lengths
        )
        # h_state: [n_layers * 2, batch_size, hidden_size]
        # c_state: [n_layers * 2, batch_size, hidden_size]
        
        # Sort the batch (dest) by decreasing lengths
        sorted_dest_lengths, sorted_indices = torch.sort(dest_lengths, dim=0, descending=True)
        sorted_dest_sequences = dest_sequences[:, sorted_indices]
        h_state = h_state[:, sorted_indices, :]
        c_state = c_state[:, sorted_indices, :]
        
        # Init hidden and memory states
        h_state = self.init_h0(h_state.permute(1, 2, 0)) # [batch_size, hidden_size, n_layers]
        c_state = self.init_c0(c_state.permute(1, 2, 0)) # [batch_size, hidden_size, n_layers]
        h_state = h_state.permute(2, 0, 1) # [n_layers, batch_size, hidden_size]
        c_state = c_state.permute(2, 0, 1) # [n_layers, batch_size, hidden_size]
        
        # We won't decode at the <eos> position, since we've finished generating as soon as we generate <eos>
        # So, decoding lengths are actual lengths - 1
        sorted_decode_lengths = (sorted_dest_lengths - 1).tolist()
        
        # Decoding
        batch_size, last = dest_sequences.size(1), None
        logits = torch.zeros(max(sorted_decode_lengths), batch_size, self.decoder.vocab_size).to(self.device)
        for t in range(max(sorted_decode_lengths)):
            batch_size_t = sum([l > t for l in sorted_decode_lengths])
            if last is not None:
                if random.random() < tf_ratio:
                    in_ = last[:batch_size_t]
                else:
                    in_ = sorted_dest_sequences[t, :batch_size_t]
            else:
                in_ = sorted_dest_sequences[t, :batch_size_t]
            # in_ [batch_size,]
            logit, h_state, c_state = self.decoder(
                in_, 
                h_state[:, :batch_size_t, :].contiguous(),
                c_state[:, :batch_size_t, :].contiguous()
            )
            # logit: [batch_size, vocab_size]
            # h_state: [num_layers, batch_size, hidden_size]
            # c_state: [num_layers, batch_size, hidden_size]
            logits[t, :batch_size_t, :] = logit
            last = torch.argmax(F.softmax(logit, dim=1), dim=1) # [batch_size,]
        
        return logits, sorted_dest_sequences, sorted_decode_lengths, sorted_indices

# Model initialization
MODEL_NAME = 'seq2seq-lstm'
N_LAYERS = 4
HIDDEN_SIZE = 512
EMBEDDING_SIZE = 300
ENC_DROPOUT = 0.3
ENC_RECURRENT_DROPOUT = 0.25
DEC_DROPOUT = 0.15
DEC_RECURRENT_DROPOUT = 0.2
N_EPOCHS = 15
BATCH_SIZE = 64
LEARNING_RATE = 1e-3
GRAD_CLIP = 1.0
TF_RATIO = 1.0

encoder = EncoderLSTM(
    embedding_size=EMBEDDING_SIZE,
    vocab_size=len(FR.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
    recurrent_dropout=ENC_RECURRENT_DROPOUT
)
encoder.load_pretrained_embeddings(fr_embeddings)
encoder.fine_tuning_embeddings(fine_tune=True)
decoder = DecoderLSTM(
    embedding_size=EMBEDDING_SIZE,
    vocab_size=len(EN.vocab),
    hidden_size=HIDDEN_SIZE,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
    recurrent_dropout=DEC_RECURRENT_DROPOUT
)
decoder.load_pretrained_embeddings(en_embeddings)
decoder.fine_tuning_embeddings(fine_tune=True)
seq2seq = SeqToSeqLSTM(encoder=encoder, decoder=decoder, device=DEVICE)
seq2seq.apply(torch_utils.xavier_init_weights)
seq2seq.to(DEVICE)
optimizer = optim.RMSprop(params=seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
print(f'Number of parameters of the model: {torch_utils.count_parameters(seq2seq):,}')
# Number of parameters of the model: 41,471,097
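
The torch_utils helpers referenced above are not shown; as an assumption, minimal stand-ins that behave the way they are used would look roughly like this (not the exact module):

# Assumed stand-ins for the torch_utils helpers (not the exact code)
def count_parameters(model):
    # Count only the parameters that the optimizer will update
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def xavier_init_weights(m):
    # Applied via model.apply(); Xavier-initialize every weight matrix
    # (dim > 1) of each submodule, leaving biases and 1-D parameters untouched
    for param in m.parameters(recurse=False):
        if param.dim() > 1:
            nn.init.xavier_uniform_(param.data)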

With the model above, training one epoch takes 04:31 minutes.
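
The training loop itself is not shown above; the epoch time comes from a loop of roughly this shape (a simplified sketch, assuming torchtext-style batches where batch.src and batch.trg are (sequences, lengths) pairs):

import time

# Simplified sketch of the loop being timed (batch.src / batch.trg are
# assumed to be (sequences, lengths) pairs yielded by a torchtext iterator)
def train_one_epoch(model, iterator, optimizer, criterion, tf_ratio, grad_clip):
    model.train()
    epoch_loss, start = 0.0, time.time()
    for batch in iterator:
        src_sequences, src_lengths = batch.src
        dest_sequences, dest_lengths = batch.trg
        optimizer.zero_grad()
        logits, sorted_dest_sequences, sorted_decode_lengths, _ = model(
            src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio
        )
        # Targets start at position 1 (the <sos> token is never predicted);
        # packing drops the padded time steps before computing the loss
        targets = sorted_dest_sequences[1:]
        logits = nn.utils.rnn.pack_padded_sequence(logits, sorted_decode_lengths).data
        targets = nn.utils.rnn.pack_padded_sequence(targets, sorted_decode_lengths).data
        loss = criterion(logits, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        epoch_loss += loss.item()
    print(f'epoch time: {time.time() - start:.0f}s | loss: {epoch_loss / len(iterator):.4f}')

# e.g. for epoch in range(N_EPOCHS):
#          train_one_epoch(seq2seq, train_iterator, optimizer, criterion, TF_RATIO, GRAD_CLIP)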

When I add an attention mechanism (Luong style), one epoch takes 1:55:47 to train. I would like to understand why.

# Model definition
class EncoderLSTM(nn.Module):
    
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout
    ):
        super(EncoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            bidirectional=True,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))
        
    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)
        
    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune
    
    def forward(self, input_sequences, sequence_lengths):
        """
        :params
            input_sequences: Tensor[seq_len, batch_size]
            sequence_lengths: Tensor[batch_size,]
            
        :return
            outputs: Tensor[seq_len, batch_size, 2 * hidden_size]
            hn: Tensor[n_layers * 2, batch_size, hidden_size]
            cn: Tensor[n_layers * 2, batch_size, hidden_size]
        """
        embedded = self.embedding(input_sequences)
        embedded = F.dropout(embedded, p=self.dropout, training=self.training)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)
        outputs, (hn, cn) = self.lstm(packed)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs)
        return outputs, hn, cn


class LuongAttention(nn.Module):
    
    def __init__(self, hidden_size, method):
        if method not in ['dot', 'concat']:
            raise NotImplementedError(f'The {method} attention is not defined!')
        
        super(LuongAttention, self).__init__()
        self.hidden_size = hidden_size
        self.method = method
        if method == 'dot':
            pass
        elif method == 'concat':
            self.W = nn.Linear(hidden_size, hidden_size)
            self.V = nn.Linear(hidden_size, 1)
        else:
            raise NotImplementedError(f'{method} not implemented!')
            
    def forward(self, h_state, enc_outputs, mask):
        """
        :args
        h_state: Tensor[n_layers, batch_size, hidden_size]
        enc_outputs: Tensor[seq_len, batch_size, hidden_size]
        mask: Tensor[seq_len, batch_size]

        :return
            attn_weights: Tensor[seq_len, batch_size, 1]
        """
        if h_state.shape[0] > 1:
            h_state = h_state.sum(dim=0) # [batch_size, hidden_size]
            h_state = h_state.unsqueeze(0) # [1, batch_size, hidden_size]

        # Calculating the alignment scores
        if self.method == 'dot':
            scores = torch.sum(h_state * enc_outputs, dim=2)
            scores = scores.unsqueeze(dim=2) / np.sqrt(self.hidden_size) # [seq_len, batch_size, 1]
        elif self.method == 'concat':
            scores = self.V(
                torch.tanh(self.W(
                    enc_outputs + h_state # [seq_len, batch_size, hidden_size]
                ))
            ) # [seq_len, batch_size, 1]
        else:
            raise NotImplementedError(f'{self.method} not implemented!')

        # Apply mask to ignore <pad> tokens
        mask = mask.unsqueeze(2) # [seq_len, batch_size, 1]
        scores = scores.masked_fill(mask == 0, -1e10)

        # Calculating the attention weights by softmaxing the alignment scores
        attn_weights = F.softmax(scores, dim=1) # [seq_len, batch_size, 1]

        return attn_weights


class DecoderLSTM(nn.Module):
    
    def __init__(
        self,
        embedding_size,
        vocab_size,
        hidden_size,
        n_layers,
        dropout,
        recurrent_dropout,
        attention
    ):
        super(DecoderLSTM, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.recurrent_dropout = recurrent_dropout
        self.attention = attention
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size,
                            num_layers=n_layers,
                            dropout=(recurrent_dropout if n_layers > 1 else 0))
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)
        
    def load_pretrained_embeddings(self, embeddings):
        self.embedding.weight = nn.Parameter(embeddings)
        
    def fine_tuning_embeddings(self, fine_tune=True):
        for p in self.embedding.parameters():
            p.requires_grad = fine_tune
        
    def forward(self, input_word_index, h_state, c_state, enc_outputs, mask):
        """
        :params
            input_word_index: Tensor[batch_size,]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
            enc_outputs: Tensor[seq_len, batch_size, hidden_size] 
            mask: Tensor[seq_len, batch_size]
            
        :return
            logit: Tensor[batch_size, vocab_size]
            h_state: Tensor[num_layers, batch_size, hidden_size]
            c_state: Tensor[num_layers, batch_size, hidden_size]
            attn_weights: Tensor[batch_size, seq_len]
        """
        embedded = self.embedding(input_word_index.unsqueeze(0)) # [seq_len=1, batch_size, embedding_size]
        outputs, (h_state, c_state) = self.lstm(embedded, (h_state, c_state))
        # outputs: [seq_len=1, batch_size, hidden_size]
        # h_state: [n_layers, batch_size, hidden_size]
        # c_state: [n_layers, batch_size, hidden_size]
        
        # Compute Attention Weights
        attn_weights = self.attention(h_state=outputs,
                                      enc_outputs=enc_outputs,
                                      mask=mask) # [seq_len, batch_size, 1]
        
        # Compute Context Vector
        context_vector = torch.bmm(
            enc_outputs.permute(1, 2, 0), # [batch_size, hidden_size, seq_len]
            attn_weights.permute(1, 0, 2), # [batch_size, seq_len, 1]
        ).permute(2, 0, 1) # [1, batch_size, hidden_size]
        
        # New input: concatenate context_vector with hidden_states
        new_input = torch.cat((context_vector, outputs), dim=2) # [1, batch_size, hidden_size * 2]
        
        # Get logit
        out = torch.tanh(self.fc1(new_input.squeeze(0)))
        logit = self.fc2(out) # [batch_size, vocab_size]
        
        return logit, h_state, c_state, attn_weights.squeeze(2)

class SeqToSeqLSTM(nn.Module):
    
    def __init__(self, encoder, decoder, pad_index, device):
        assert encoder.n_layers == decoder.n_layers, \
            'Encoder and Decoder must have the same number of recurrent layers'
        assert encoder.hidden_size == decoder.hidden_size, \
            'Encoder and Decoder must have the same number of recurrent hidden units'
        
        super(SeqToSeqLSTM, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.pad_index = pad_index
        self.init_h0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers) 
        self.init_c0 = nn.Linear(decoder.n_layers * 2, decoder.n_layers)
        self.fc = nn.Linear(2 * encoder.hidden_size, encoder.hidden_size)
        self.device = device
        
    def create_mask(self, src_sequences):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            
        :return
            mask: Tensor[seq_len, batch_size]
        """
        mask = (src_sequences != self.pad_index)
        return mask
        
    def forward(self, src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio):
        """
        :params
            src_sequences: Tensor[seq_len, batch_size]
            src_lengths: Tensor[batch_size,]
            dest_sequences: Tensor[seq_len, batch_size]
            dest_lengths: Tensor[batch_size,]
            tf_ratio: float
            
        :return
            logits: Tensor[max(decode_lengths), batch_size, vocab_size]
            sorted_dest_sequences: Tensor[seq_len, batch_size]
            sorted_decode_lengths: Tensor[batch_size,]
            sorted_indices: Tensor[batch_size,]
        """
        mask = self.create_mask(src_sequences) # [seq_len, batch_size]
        
        # Encoding
        enc_outputs, h_state, c_state = self.encoder(
            input_sequences=src_sequences,
            sequence_lengths=src_lengths
        )
        # enc_outputs: [seq_len, batch_size, 2 * hidden_size]
        # h_state: [n_layers * 2, batch_size, hidden_size]
        # c_state: [n_layers * 2, batch_size, hidden_size]
        
        enc_outputs = self.fc(enc_outputs)
        # enc_outputs: [seq_len, batch_size, hidden_size]
        
        # Sort the batch (dest) by decreasing lengths
        sorted_dest_lengths, sorted_indices = torch.sort(dest_lengths, dim=0, descending=True)
        sorted_dest_sequences = dest_sequences[:, sorted_indices]
        enc_outputs = enc_outputs[:, sorted_indices, :]
        h_state = h_state[:, sorted_indices, :]
        c_state = c_state[:, sorted_indices, :]
        
        # Init hidden and memory states
        h_state = self.init_h0(h_state.permute(1, 2, 0)) # [batch_size, hidden_size, n_layers]
        c_state = self.init_c0(c_state.permute(1, 2, 0)) # [batch_size, hidden_size, n_layers]
        h_state = h_state.permute(2, 0, 1) # [n_layers, batch_size, hidden_size]
        c_state = c_state.permute(2, 0, 1) # [n_layers, batch_size, hidden_size]
        
        # We won't decode at the <eos> position, since we've finished generating as soon as we generate <eos>
        # So, decoding lengths are actual lengths - 1
        sorted_decode_lengths = (sorted_dest_lengths - 1).tolist()
        
        # Decoding
        batch_size, last = dest_sequences.size(1), None
        logits = torch.zeros(max(sorted_decode_lengths), batch_size, self.decoder.vocab_size).to(self.device)
        for t in range(max(sorted_decode_lengths)):
            batch_size_t = sum([l > t for l in sorted_decode_lengths])
            if last is not None:
                if random.random() < tf_ratio:
                    in_ = last[:batch_size_t]
                else:
                    in_ = sorted_dest_sequences[t, :batch_size_t]
            else:
                in_ = sorted_dest_sequences[t, :batch_size_t]
            # in_ [batch_size,]
            logit, h_state, c_state, _ = self.decoder(
                in_, 
                h_state[:, :batch_size_t, :].contiguous(),
                c_state[:, :batch_size_t, :].contiguous(),
                enc_outputs[:, :batch_size_t, :],
                mask[:, :batch_size_t]
            )
            # logit: [batch_size, vocab_size]
            # h_state: [num_layers, batch_size, hidden_size]
            # c_state: [num_layers, batch_size, hidden_size]
            logits[t, :batch_size_t, :] = logit
            last = torch.argmax(F.softmax(logit, dim=1), dim=1) # [batch_size,]
        
        return logits, sorted_dest_sequences, sorted_decode_lengths, sorted_indices

# Model initialization
encoder = EncoderLSTM(embedding_size=EMBEDDING_SIZE,
                      vocab_size=len(FR.vocab),
                      hidden_size=HIDDEN_SIZE,
                      n_layers=N_LAYERS,
                      dropout=ENC_DROPOUT,
                      recurrent_dropout=ENC_RECURRENT_DROPOUT)
encoder.load_pretrained_embeddings(fr_embeddings)
encoder.fine_tuning_embeddings(fine_tune=True)
attention = LuongAttention(hidden_size=HIDDEN_SIZE, method='dot')
decoder = DecoderLSTM(embedding_size=EMBEDDING_SIZE,
                      vocab_size=len(EN.vocab),
                      hidden_size=HIDDEN_SIZE,
                      n_layers=N_LAYERS,
                      dropout=DEC_DROPOUT,
                      recurrent_dropout=DEC_RECURRENT_DROPOUT,
                      attention=attention)
decoder.load_pretrained_embeddings(en_embeddings)
decoder.fine_tuning_embeddings(fine_tune=True)
seq2seq = SeqToSeqLSTM(encoder=encoder,
                       decoder=decoder,
                       pad_index=EN.vocab.stoi[EN.pad_token],
                       device=DEVICE)
seq2seq.apply(torch_utils.xavier_init_weights)
seq2seq.to(DEVICE)
optimizer = optim.RMSprop(params=seq2seq.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
print(f'Number of parameters of the model: {torch_utils.count_parameters(seq2seq):,}')
# Number of parameters of the model: 42,520,697

The two models have almost the same number of parameters.

The data comes from the Europarl parallel corpus: 89,752 examples, each between 15 and 25 tokens long.
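
The FR / EN fields, the iterators and the pretrained embeddings come from the legacy torchtext API; a rough sketch of that part of the pipeline (file names, tokenizers and vectors below are placeholders, not necessarily the exact setup used for the numbers above):

from torchtext.data import Field, BucketIterator, TabularDataset
from torchtext.vocab import FastText

# Rough sketch of the data pipeline (paths and tokenizer names are placeholders)
FR = Field(tokenize='spacy', tokenizer_language='fr_core_news_sm',
           init_token='<sos>', eos_token='<eos>', lower=True, include_lengths=True)
EN = Field(tokenize='spacy', tokenizer_language='en_core_web_sm',
           init_token='<sos>', eos_token='<eos>', lower=True, include_lengths=True)

train_data, valid_data = TabularDataset.splits(
    path='data/', train='train.tsv', validation='valid.tsv', format='tsv',
    fields=[('src', FR), ('trg', EN)]
)
FR.build_vocab(train_data, vectors=FastText(language='fr'))
EN.build_vocab(train_data, vectors=FastText(language='en'))
fr_embeddings, en_embeddings = FR.vocab.vectors, EN.vocab.vectors

# Sorting within each batch by decreasing source length is required by
# pack_padded_sequence in the encoder
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), batch_size=BATCH_SIZE,
    sort_within_batch=True, sort_key=lambda x: len(x.src), device=DEVICE
)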

Does anyone know why the model with attention takes so much longer to train?
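
I have not profiled it in detail yet; a rough sketch of how the encoder and the full forward pass could be timed separately on a single batch (torch.cuda.synchronize() is needed because CUDA kernels run asynchronously):

import time

def time_forward(model, batch, tf_ratio):
    # Rough sketch: compare the encoder alone against the full forward pass
    # (which includes the token-by-token decoding loop) on one batch
    model.eval()
    src_sequences, src_lengths = batch.src
    dest_sequences, dest_lengths = batch.trg
    with torch.no_grad():
        torch.cuda.synchronize()
        start = time.time()
        model.encoder(input_sequences=src_sequences, sequence_lengths=src_lengths)
        torch.cuda.synchronize()
        enc_time = time.time() - start

        start = time.time()
        model(src_sequences, src_lengths, dest_sequences, dest_lengths, tf_ratio)
        torch.cuda.synchronize()
        full_time = time.time() - start
    print(f'encoder: {enc_time * 1000:.1f} ms | full forward: {full_time * 1000:.1f} ms')

Comparing those two numbers for the model with and without attention should show where the extra time per batch goes.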
