How to do inference when using a bidirectional decoder in a seq2seq model?

Date: 2018-03-12 09:25:55

Tags: keras

I built a seq2seq model with Keras like this:

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Bidirectional
from keras import layers


def build_bidi_model():
    rnn = layers.LSTM

    # Encoder
    # encoder inputs
    encoder_inputs = Input(shape=(None,), name='encoder_inputs')
    # encoder embedding
    encoder_embedding = Embedding(num_encoder_tokens, encoder_embedding_dim, name='encoder_embedding')(encoder_inputs)
    # encoder lstm
    encoder_lstm = Bidirectional(rnn(latent_dim, return_state=True, dropout=0.2,
                                     recurrent_dropout=0.5), name='encoder_lstm')
    # the bidirectional encoder returns [outputs, fwd_h, fwd_c, bwd_h, bwd_c]
    _, *encoder_states = encoder_lstm(encoder_embedding)

    # Decoder
    # decoder inputs
    decoder_inputs = Input(shape=(None,), name='decoder_inputs')
    # decoder embeddding
    decoder_embedding = Embedding(num_decoder_tokens, decoder_embedding_dim, name='decoder_embedding')(decoder_inputs)
    # decoder lstm
    decoder_lstm = Bidirectional(rnn(latent_dim, return_state=True, 
                       return_sequences=True, dropout=0.2,
                       recurrent_dropout=0.5),name='decoder_lstm')
    # get outputs and decoder states
    rnn_outputs, *decoder_states = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    # decoder dense
    decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(rnn_outputs)

    bidi_model = Model([encoder_inputs,decoder_inputs], [decoder_outputs])
    bidi_model.compile(optimizer='adam', loss='categorical_crossentropy')

    return bidi_model

The training loss and validation loss are really low, but when I try to run inference with the trained model, the results are terrible. Here is the inference code:

import numpy as np

# invert the token lookups so sampled indices map back to tokens
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_word_index = dict((i, word) for word, i in target_token_index.items())

def decode_sequence(input_seq, encoder_model, decoder_model):
    # get encoder states
    states_value = encoder_model.predict(input_seq)

    # create an empty target sequence seeded with the start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['start']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output, *decoder_states = decoder_model.predict([target_seq] + states_value)

        sampled_token_index = np.argmax(output[0, -1, :])
        sampled_word = reverse_target_word_index[sampled_token_index]
        decoded_sentence += sampled_word

        if sampled_word == 'end' or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # update target_seq
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # update states
        states_value = decoder_states

    return decoded_sentence
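
The question does not show how encoder_model and decoder_model are constructed. Assuming the layers and tensors from build_bidi_model are still in scope, the usual Keras seq2seq reconstruction would look roughly like this (a sketch, not code from the question; the four state inputs correspond to the bidirectional decoder's forward/backward h and c states):

# sketch: inference models rebuilt from the trained layers (names assumed)
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_inputs = [Input(shape=(latent_dim,)) for _ in range(4)]
decoder_out, *decoder_out_states = decoder_lstm(decoder_embedding,
                                                initial_state=decoder_state_inputs)
decoder_model = Model([decoder_inputs] + decoder_state_inputs,
                      [decoder_dense(decoder_out)] + decoder_out_states)

With this setup, encoder_model.predict returns four state arrays, matching the four state inputs consumed by decode_sequence above.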

Strangely, when I built a similar model with the bidirectional wrapper removed from the decoder, the inference results were very good. So I wonder: is my code wrong? Please help.

1 Answer:

Answer 0 (score: 1):

I think the problem is this: a bidirectional decoder cannot be used for inference. A Bidirectional layer reads the sequence both forwards and backwards, but in inference mode your model has only the start token; the rest of the target sequence, including the end token, is not known yet, so the backward direction has nothing real to read. Feeding only the start token is therefore the wrong input for a bidirectional decoder: garbage in, garbage out, and the model's performance drops sharply.
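
If you want to keep a bidirectional encoder and still decode step by step, a common fix (a sketch using the question's variable names, not code from this answer) is to drop the Bidirectional wrapper from the decoder only and concatenate the encoder's forward and backward states so they fit a unidirectional decoder of width 2 * latent_dim:

from keras.layers import Concatenate

# encoder stays bidirectional; merge its forward/backward states
_, fwd_h, fwd_c, bwd_h, bwd_c = encoder_lstm(encoder_embedding)
encoder_states = [Concatenate()([fwd_h, bwd_h]),
                  Concatenate()([fwd_c, bwd_c])]

# the decoder is a plain LSTM sized to accept the concatenated states
decoder_lstm = rnn(latent_dim * 2, return_sequences=True, return_state=True,
                   dropout=0.2, recurrent_dropout=0.5, name='decoder_lstm')
rnn_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

With this change, decode_sequence works as written, except that states_value now holds just the decoder's two LSTM states.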

Hope this makes sense.