How do I correctly generate sequences with an encoder-decoder LSTM?

Asked: 2019-07-05 00:53:23

Tags: keras lstm seq2seq data-generation encoder-decoder

I am implementing code to generate labelled data for Natural Language Understanding (NLU), following the paper "Labeled Data Generation with Encoder-Decoder LSTM for Semantic Slot Filling" (https://pdfs.semanticscholar.org/7ffe/83d7dd3a474e15ccc2aef412009f100a5802.pdf). My architecture is a simple encoder-decoder LSTM, but since the sentences I generate (for both words and labels) are not correct, I am first trying to generate exactly the same sentence as the input (words only). Unfortunately, this is not working properly.

I am using word2vec for the word embeddings, with the embedding size set to 64 (as suggested in the paper). The encoder LSTM receives the sequence in reverse order and has a dropout of 0.5. The decoder LSTM also has a dropout of 0.5 and a softmax layer over each output step to map to the most probable word. The inputs are exactly the same as the targets (the same sentences), since at first I only want to reproduce the input sentence.
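
The w_emb matrix used below is a (vocab_size, 64) numpy array holding one word2vec vector per word index. A minimal sketch of how such a matrix can be built with gensim (the model path is a placeholder; w2i is the word-to-index dictionary also used in the inference code further down):

import numpy as np
from gensim.models import Word2Vec

w2v = Word2Vec.load("word2vec_64d.model")      # placeholder path, vector_size = 64
w_emb = np.zeros((len(w2i), w2v.vector_size))  # one row per word index
for word, idx in w2i.items():
    if word in w2v.wv:
        w_emb[idx] = w2v.wv[word]              # copy the pretrained vector
    # words missing from the word2vec vocabulary keep a zero vector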

For training I used the Adam optimizer with categorical_crossentropy as the loss. For inference I use beam search (B = 3) to generate the sequences.

My training code:

from keras.layers import Input, Embedding, LSTM, Dense
from keras.models import Model
from keras import backend as K

def pretrained_embedding_layer(emb):
    # wrap the pretrained word2vec matrix in a frozen Keras Embedding layer
    vocab_len = len(emb)
    emb_dim = len(emb[0])
    emb_layer = Embedding(vocab_len, emb_dim, trainable=False)
    emb_layer.build((None,))
    emb_layer.set_weights([emb])

    return emb_layer

LSTM_encoder = LSTM(1024, dropout=0.5, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, dropout=0.5, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")

K.set_learning_phase(1)

def model1_enc_dec(input_shape, w_emb):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb_layer = pretrained_embedding_layer(w_emb)
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_states = [enc_state_h, enc_state_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=encoder_states)
    dec_out = dense_w(dec_out)
    model1 = Model(inputs=[words_indices], outputs=[dec_out])

    return model1

model = model1_enc_dec((maxlen,), w_emb)
model.summary()
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.fit(train_w, train_lab_w, validation_data=(val_w, val_lab_w), epochs=epochs, verbose=1, shuffle=True)
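
Since the loss is categorical_crossentropy, the targets train_lab_w are one-hot encoded per timestep, i.e. of shape (num_samples, maxlen, vocab_w_size). A minimal sketch of how they can be built from the padded index sequences (assuming train_w holds the integer-encoded sentences):

from keras.utils import to_categorical

# the targets are the same sentences as the encoder input, one-hot encoded
# over the word vocabulary: (num_samples, maxlen, vocab_w_size)
train_lab_w = to_categorical(train_w, num_classes=vocab_w_size)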

My inference code:

import numpy as np
from math import log

wemb_layer = Embedding(len(w_emb), len(w_emb[0]), trainable=False)
wemb_layer.build((None,))
LSTM_encoder = LSTM(1024, return_state=True, go_backwards=True, name='lstm_encoder')
LSTM_decoder = LSTM(1024, return_sequences=True, return_state=True, name='lstm_decoder')
dense_w = Dense(vocab_w_size, activation='softmax', name="word_output")

def target_model(input_shape):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_states = [enc_state_h, enc_state_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=encoder_states)
    dec_out = dense_w(dec_out)
    model = Model(inputs=[words_indices], outputs=[dec_out])

    return model

target_model = target_model((maxlen,))
wemb_layer.set_weights(model.layers[1].get_weights()) # layer 0: input
LSTM_encoder.set_weights(model.layers[2].get_weights())
LSTM_decoder.set_weights(model.layers[3].get_weights())
dense_w.set_weights(model.layers[4].get_weights())

def model1_enco_infe(input_shape):
    words_indices = Input(shape=input_shape, dtype='int32')
    wemb = wemb_layer(words_indices)
    enc_out, enc_state_h, enc_state_c = LSTM_encoder(wemb)
    encoder_model = Model(inputs=[words_indices], outputs=[enc_state_h, enc_state_c])

    return encoder_model

def model1_deco_infe(input_shape):
    dec_word_input = Input(shape=input_shape, dtype='int32')
    dec_state_input_h = Input(shape=(1024,))
    dec_state_input_c = Input(shape=(1024,))
    wemb = wemb_layer(dec_word_input)
    dec_states_input = [dec_state_input_h, dec_state_input_c]
    dec_out, dec_state_h, dec_state_c = LSTM_decoder(wemb, initial_state=dec_states_input)
    dec_states_output = [dec_state_h, dec_state_c]
    deco_out = dense_w(dec_out)
    decoder_model = Model(inputs=[dec_word_input] + dec_states_input, outputs=[deco_out] + dec_states_output)

    return decoder_model

encoder_model = model1_enco_infe((maxlen,))
decoder_model = model1_deco_infe((1,))
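
As a sanity check before the beam search, I can also decode greedily with these two inference models; if even greedy decoding collapses to one repeated word, the problem is in the trained weights rather than in the beam search. A minimal sketch (w2i / i2w are the same word-index mappings used further down):

import numpy as np

def greedy_decode(word_seq, encoder_model, decoder_model, w2i, i2w, max_len=50):
    # encode the whole input sentence once and reuse its final states
    state_h, state_c = encoder_model.predict(word_seq)
    states = [state_h, state_c]

    target = np.array([[w2i['BOS']]])   # start decoding from the BOS token
    decoded = []
    for _ in range(max_len):
        out, state_h, state_c = decoder_model.predict([target] + states)
        idx = int(np.argmax(out[0, -1, :]))
        if i2w[idx] == 'EOS':
            break
        decoded.append(i2w[idx])
        target = np.array([[idx]])      # feed the predicted word back in
        states = [state_h, state_c]     # carry the decoder state forward
    return ' '.join(decoded)

# e.g.: greedy_decode(np.reshape(train_w[i], (1, maxlen)), encoder_model, decoder_model, w2i, i2w)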

def beamsearch_B(deco_w_out, beam):
    # return the indices of the `beam` most probable words in the softmax output
    words_index = []
    dw = deco_w_out.copy()
    for i in range(beam):
        word_index = np.argmax(dw, axis=-1)
        dw[0][0][word_index[0][0]] = 0
        words_index.append(word_index[0][0])

    return words_index

def generate_model1_add(word_seq, encoder_model, decoder_model, dec_word_input, id2word, beam):
    [enc_state_h, enc_state_c] = encoder_model.predict(word_seq)
    states = [enc_state_h, enc_state_c]
    word_sentence = ''
    probs_word = []
    word_sentences = []
    dec_word_inputs = []
    states_beam = []
    stop_condition = False

    # first decoder step: expand the BOS token into the initial beam
    [dec_w_out, dec_state_h, dec_state_c] = decoder_model.predict([dec_word_input] + states)
    words_index = beamsearch_B(dec_w_out, beam)

    for i in range(beam):
        probs_word.append(-log(dec_w_out[0][0][words_index[i]]))
        word_sentences.append(id2word[words_index[i]])
        dec_word_inputs.append([words_index[i]])
        states_beam.append([dec_state_h, dec_state_c])

    n_words = 1
    endgame = []

    while not stop_condition:
        words_indexes, words_sentences, probs_words, states_b = [], [], [], []
        for k in range(beam):
            [dec_w_out, dec_state_h, dec_state_c] = decoder_model.predict([dec_word_inputs[k]] + states_beam[k])
            words_index = beamsearch_B(dec_w_out, beam)
            states = [dec_state_h, dec_state_c]

            for j in range(beam):
                words_indexes.append(words_index[j])
                probs_words.append(probs_word[k] * -log(dec_w_out[0][0][words_index[j]]) + 1e-7)
                words_sentences.append(word_sentences[k] + ' ' + id2word[words_index[j]])
                states_b.append(states)

        # rank all beam*beam candidates by the inverse of their accumulated score
        # and keep the top `beam`
        probs = []
        for i in range(len(probs_words)):
            probs.append(1 / (probs_words[i]))
        indexes = []
        for i in range(beam):
            index = np.argmax(probs, axis=-1)
            probs[index] = 0
            indexes.append(index)

        for i in range(beam):
            probs_word[i] = probs_words[indexes[i]]
            word_sentences[i] = words_sentences[indexes[i]]
            dec_word_inputs[i] = [words_indexes[indexes[i]]]
            states_beam[i] = states_b[indexes[i]]
            if id2word[words_indexes[indexes[i]]] == 'EOS':
                endgame.append(i)

        if len(endgame) == 1:
            word_sentence = word_sentences[endgame[0]]
            stop_condition = True
        elif len(endgame) > 1:
            word_sentence = word_sentences[np.min(endgame)]
            stop_condition = True

        n_words += 1

        if n_words > 50:
            word_sentence = word_sentences[0]
            stop_condition = True

    return word_sentence

word_sentence = generate_model1_add(np.reshape(train_w[i], (1, maxlen)), encoder_model, decoder_model, [w2i['BOS']], i2w, 3)

An example of the sequences I generate:

Input sentence: BOS it is my fourth time flying from boston to atlanta EOS PAD PAD PAD ...

Generated sentence: BOS from from from from from from from from from from from from from from from from from from from ...

The trained weights seem to be wrong, yet during training I get loss: 0.0032 - acc: 0.9990 - val_loss: 0.0794 - val_acc: 0.9888.

All I want is to generate exactly the same sentence as the input. I hope you can help me. Thanks in advance!

0 Answers:

There are no answers yet.