我正在尝试训练一个编码器-解码器模型,用于将英语翻译成印地语。我的数据集包含10240对句子,训练集与测试集按80/20的比例划分。我按如下方式对输入和输出进行编码:
def gen_batch(trainX, trainY, batch_size):
    """Yield teacher-forcing batches for the encoder-decoder model, forever.

    Each source/target sentence is split on whitespace and every word is
    mapped to its integer id through the module-level ``eng_char_token`` /
    ``hin_char_token`` lookups. Rows and positions that are not filled stay 0.

    Args:
        trainX: Sliceable sequence of source sentences (strings).
        trainY: Sliceable sequence of target sentences, aligned with trainX.
        batch_size: Number of sentence pairs per yielded batch.

    Yields:
        ``([encoder_inputs, decoder_inputs], one_hot_decoder_output)`` where
        ``encoder_inputs`` is ``(batch_size, max_length_src)``,
        ``decoder_inputs`` is ``(batch_size, max_length_tar)`` and
        ``one_hot_decoder_output`` is
        ``(batch_size, max_length_tar, hin_vocab_size)``. The target at
        position ``j - 1`` is the one-hot encoding of decoder token ``j``,
        i.e. the decoder input shifted left by one step.

    Raises:
        ValueError: If ``trainX`` is empty (the generator could never yield).
        KeyError: If a word is missing from the token lookup tables.
        IndexError: If a sentence has more words than the configured
            maximum length.
    """
    if len(trainX) == 0:
        # Without this guard the outer loop would spin forever without yielding.
        raise ValueError("gen_batch requires at least one training sentence")

    while True:
        for t in range(0, len(trainX), batch_size):
            # Allocate fresh buffers for every batch. Reusing one set of
            # arrays would leave tokens and one-hot targets from previous
            # batches in place wherever the current sentences are shorter,
            # and would mutate arrays that were already handed to the caller.
            encoder_inputs = np.zeros(
                shape=(batch_size, max_length_src), dtype="float32")
            decoder_inputs = np.zeros(
                shape=(batch_size, max_length_tar), dtype="float32")
            one_hot_decoder_output = np.zeros(
                shape=(batch_size, max_length_tar, hin_vocab_size),
                dtype="float32")

            for i, sentence in enumerate(trainX[t:t + batch_size]):
                for j, word in enumerate(sentence.split()):
                    encoder_inputs[i, j] = eng_char_token[word]

            for i, sentence in enumerate(trainY[t:t + batch_size]):
                for j, word in enumerate(sentence.split()):
                    token_id = hin_char_token[word]
                    decoder_inputs[i, j] = token_id
                    if j > 0:
                        # The target is one step ahead of the decoder input:
                        # the first token is never a target.
                        one_hot_decoder_output[i, j - 1, token_id] = 1

            yield ([encoder_inputs, decoder_inputs], one_hot_decoder_output)
以下是使用的模型-
# Size shared by both embedding layers and both LSTM layers.
latent_dim = 50

# Encoder: embed the source token ids and keep only the LSTM's final states.
encoder_inp = Input(shape=(None,))
# mask_zero=True treats id 0 as padding, so padded positions are skipped.
encoder_embedding = Embedding(eng_vocab_size, latent_dim, mask_zero=True)(encoder_inp)
encoder_lstm, state_h, state_c = LSTM(latent_dim,
                                      return_state=True)(encoder_embedding)
# The per-sequence encoder output is not used; only the states are passed on.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inp = Input(shape=(None,))
decoder_embedding = Embedding(hin_vocab_size, latent_dim, mask_zero=True)
decoder_embedding_output = decoder_embedding(decoder_inp)
# return_sequences=True yields one output per target time step; the returned
# decoder states are discarded in this training graph.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_lstm_ouput, _, _ = decoder_lstm(decoder_embedding_output, initial_state=encoder_states)
# Softmax over the target vocabulary at every time step.
decoder_dense = Dense(hin_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm_ouput)

# Training model: (source ids, target ids) -> per-step target distribution.
model = Model([encoder_inp, decoder_inp], decoder_outputs)
optimizer = RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
模型的训练方式如下:
# Training configuration.
batch_size = 128
filename = 'saved- epoch:{epoch} and val_loss:{val_loss}'

# Checkpoint callback that keeps the model with the lowest validation loss.
# It is constructed here but not passed to the training call below.
checkpoint = ModelCheckpoint(
    filename,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)

# Endless batch generators for the training and validation splits.
train_batches = gen_batch(train["english_sentence"], train["hindi_sentence"], batch_size)
val_batches = gen_batch(test["english_sentence"], test["hindi_sentence"], batch_size)

# Number of full batches drawn per epoch from each split.
train_steps = len(train) // batch_size
val_steps = len(test) // batch_size

# NOTE(review): fit_generator is deprecated in recent TensorFlow/Keras
# releases in favour of fit(); confirm against the installed version.
history = model.fit_generator(
    generator=train_batches,
    steps_per_epoch=train_steps,
    validation_data=val_batches,
    validation_steps=val_steps,
    epochs=200,
)
训练损失在不断下降,而验证损失却在上升,但训练损失曲线始终位于验证损失曲线的上方。我无法理解其中的原因,模型也无法给出良好的预测结果。