I tried to implement a basic seq2seq model in TensorFlow to summarize documents, without relying on the ready-made seq2seq modules in tf.contrib. After I finished the model and started training, I found that the cost keeps growing and eventually becomes NaN. This seems very strange to me, because I clip the gradients before applying them to optimize the cost. The predictions are also odd: every predicted sequence is identical to the previous one. Can anyone help me? Thank you very much! Here is the training log (right after it I have distilled the loss/clipping setup into a short standalone sketch):
Epoch: 1, Batch: 9660, Sample: 309088, Loss: 10501.7
Epoch: 1, Batch: 9661, Sample: 309120, Loss: 12208.6
Epoch: 1, Batch: 9662, Sample: 309152, Loss: 10829.7
Epoch: 1, Batch: 9663, Sample: 309184, Loss: nan
Epoch: 1, Batch: 9664, Sample: 309216, Loss: nan
Title: ['<START>', 'Seahawks', 'sign', 'former', 'Minnesota', 'kicker', 'Blair', 'Walsh', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']
Pred: ['<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>']
Title: ['<START>', 'Redskins', 'WR', 'DeSean', 'Jackson', 'Packers', 'LB', 'Clay', 'Matthews', 'active', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']
Pred: ['<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>']
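To make the question easier to follow, the cost and gradient-clipping part of the model boils down to this minimal standalone sketch (toy sizes and illustrative variable names, not my real graph):

import tensorflow as tf

# Minimal sketch of the loss + clipping used in the full model below
# (toy sizes: 2 decoder steps, hidden size 4, vocab size 5).
hidden = [tf.placeholder(tf.float32, [None, 4]) for _ in range(2)]    # decoder output per step
targets = [tf.placeholder(tf.int64, [None]) for _ in range(2)]        # shifted label per step
maskings = [tf.placeholder(tf.float32, [None]) for _ in range(2)]     # 1.0 = real token, 0.0 = padding

W = tf.Variable(tf.random_normal([4, 5]))
b = tf.Variable(tf.zeros([5]))
logits = [tf.matmul(h, W) + b for h in hidden]

# Masked cross-entropy, summed (not averaged) over time steps and batch.
crossents = [tf.nn.sparse_softmax_cross_entropy_with_logits(labels=t, logits=l) * m
             for l, t, m in zip(logits, targets, maskings)]
cost = tf.reduce_sum(tf.add_n(crossents))

# Adam with element-wise gradient clipping to [-1, 1].
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
gvs = optimizer.compute_gradients(cost)
capped_gvs = [(tf.clip_by_value(g, -1., 1.), v) for g, v in gvs]
train = optimizer.apply_gradients(capped_gvs)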
Here is my TensorFlow code:
import json

import numpy as np
import tensorflow as tf


def model(argv):
    # Hyperparameters.
    batch_size = 32
    vocab_size = 100000
    emb_size = 200
    input_words_size = 500
    output_words_size = 20
    min_input_size = 100
    encoder_hidden_unit = 512
    encoder_rnn_layer = 3
    encoder_keep_prob = 1.0
    decoder_hidden_unit = 512
    decoder_rnn_layer = 3
    decoder_keep_prob = 1.0

    graph = tf.Graph()
    with graph.as_default():
        with tf.device('/cpu:0'):
            with tf.name_scope('input_layer'):
                embedding_placeholder = tf.placeholder(
                    tf.float32, [vocab_size, emb_size], 'emb_holder')
                input_idx_placeholder = tf.placeholder(
                    tf.int64, [None, input_words_size], 'input_idx')
                output_idx_placeholder = tf.placeholder(
                    tf.int64, [None, output_words_size], 'output_idx')
                output_idx_swap_placeholder = tf.placeholder(
                    tf.int64, [output_words_size - 1, None], 'output_idx_swap')
                masking_placeholder = tf.placeholder(
                    tf.float32, [output_words_size - 1, None], 'masking')
            with tf.name_scope('pre-train_embedding'):
                # Pre-trained embeddings are loaded via a placeholder and kept frozen.
                emb_weight = tf.Variable(tf.constant(
                    0.0, shape=[vocab_size, emb_size]), trainable=False, name='emb_W')
                embedding_init = emb_weight.assign(embedding_placeholder)
            with tf.name_scope('embedding_layer'):
                encoder_embedding_out = tf.nn.embedding_lookup(emb_weight, input_idx_placeholder)
                decoder_embedding_out = tf.nn.embedding_lookup(emb_weight, output_idx_placeholder)
            with tf.name_scope('gen_decoder_input_list'):
                target_label = tf.unstack(output_idx_swap_placeholder)
                target_masking = tf.unstack(masking_placeholder)
        with tf.device('/gpu:0'):
            with tf.name_scope('encoder'):
                encoder_rnn_cell = tf.contrib.rnn.GRUCell(encoder_hidden_unit)
                encoder_rnn_cell = tf.contrib.rnn.DropoutWrapper(
                    encoder_rnn_cell, output_keep_prob=encoder_keep_prob)
                encoder = tf.contrib.rnn.MultiRNNCell([encoder_rnn_cell] * encoder_rnn_layer)
                encoder_init_state = encoder.zero_state(batch_size, tf.float32)
                encoder_outputs = []
                encoder_state = encoder_init_state
                with tf.variable_scope('encoder_rnn'):
                    # Manually unrolled encoder over the padded input sequence.
                    for i in xrange(input_words_size):
                        if i > 0:
                            tf.get_variable_scope().reuse_variables()
                        (rnn_output, encoder_state) = encoder(encoder_embedding_out[:, i, :], encoder_state)
                        encoder_outputs.append(rnn_output)
                # encoder_output = encoder_outputs[-1]
                encoder_final_state = encoder_state
        with tf.device('/cpu:0'):
            with tf.name_scope('decoder'):
                decoder_b = tf.get_variable(
                    "decoder_bias", shape=[vocab_size],
                    initializer=tf.contrib.layers.xavier_initializer())
                decoder_weight = tf.get_variable(
                    "decoder_weight",
                    shape=[decoder_hidden_unit, vocab_size],
                    initializer=tf.contrib.layers.xavier_initializer())
                decoder_rnn_cell = tf.contrib.rnn.GRUCell(decoder_hidden_unit)
                decoder_rnn_cell = tf.contrib.rnn.DropoutWrapper(
                    decoder_rnn_cell, output_keep_prob=decoder_keep_prob)
                decoder = tf.contrib.rnn.MultiRNNCell([decoder_rnn_cell] * decoder_rnn_layer)
                # decoder_init_state = decoder.zero_state(batch_size, tf.float32)
                decoder_init_state = encoder_final_state
                decoder_outputs = []
                decoder_state = decoder_init_state
                decoder_pred_state = decoder_init_state
                decoder_pred_outputs = []
                result = []
                with tf.variable_scope('decoder_rnn'):
                    # Training decoder: teacher forcing with the ground-truth title tokens.
                    for i in xrange(output_words_size):
                        if i > 0:
                            tf.get_variable_scope().reuse_variables()
                        (rnn_output, decoder_state) = decoder(decoder_embedding_out[:, i, :], decoder_state)
                        decoder_outputs.append(rnn_output)
                    # Prediction decoder: greedily feed back the embedding of the previous argmax.
                    for i in xrange(output_words_size):
                        tf.get_variable_scope().reuse_variables()
                        if i == 0:
                            (pred_output, decoder_pred_state) = decoder(
                                decoder_embedding_out[:, i, :], decoder_pred_state)
                        else:
                            (pred_output, decoder_pred_state) = decoder(decoder_pred_outputs[-1], decoder_pred_state)
                        pred_logit = tf.matmul(pred_output, decoder_weight) + decoder_b
                        pred_result = tf.argmax(pred_logit, 1)
                        result.append(pred_result)
                        pred_emb = tf.nn.embedding_lookup(emb_weight, pred_result)
                        decoder_pred_outputs.append(pred_emb)
        with tf.device('/gpu:0'):
            with tf.name_scope('train_gpu'):
                train_seq = decoder_outputs[:-1]
                train_logits = [tf.matmul(i, decoder_weight) + decoder_b for i in train_seq]
                log_prep_list = []
                for logit, target, masking in zip(train_logits, target_label, target_masking):
                    target = tf.reshape(target, [-1])
                    masking = tf.reshape(masking, [-1])
                    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=target, logits=logit)
                    log_prep_list.append(crossent * masking)
                log_preps = tf.add_n(log_prep_list)
                cost = tf.reduce_sum(log_preps)
                # train = tf.train.GradientDescentOptimizer(0.01).minimize(cost)
                optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
                gvs = optimizer.compute_gradients(cost)
                capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
                train = optimizer.apply_gradients(capped_gvs)
                # target_seq = decoder_embedding_out[:, 1:, :]
        with tf.name_scope('initial'):
            init = tf.global_variables_initializer()
    with tf.Session(graph=graph) as sess:
        vector_file = argv[1]
        rank_file = argv[2]
        emb_size = int(argv[3])
        train_file = argv[4]
        epoch = int(argv[5])
        # gen_top_N_list, gen_emb_mat and tokenize are my own helpers (defined elsewhere, not shown).
        valid_list = gen_top_N_list(vocab_size, rank_file)
        word_idx, idx_word, emb_mat = gen_emb_mat(vector_file, emb_size, valid_list)
        sess.run(init)
        sess.run(embedding_init, feed_dict={embedding_placeholder: emb_mat})
        del emb_mat
        for ep in xrange(epoch):
            f = open(train_file, 'r')
            cnt = 0
            train_content = []
            train_title = []
            train_masking = []
            for line in f:
                data = json.loads(line.strip())
                title_list = tokenize(data['title'])
                content_list = tokenize(data['content'])
                if len(title_list) > output_words_size - 2 \
                        or len(content_list) > input_words_size \
                        or len(content_list) < min_input_size:
                    continue
                # Prepend <START>, pad with <END>, and build the loss mask.
                title_list.insert(0, '<START>')
                title_masking = [1.0] * (len(title_list) + 1)
                title_masking.extend([0.0] * (output_words_size - 1 - len(title_list)))
                title_list.extend(['<END>'] * (output_words_size - len(title_list)))
                content_list.extend(['<END>'] * (input_words_size - len(content_list)))
                train_content.append([word_idx.get(i, word_idx['<unk>']) for i in content_list])
                train_title.append([word_idx.get(i, word_idx['<unk>']) for i in title_list])
                train_masking.append(title_masking)
                cnt += 1
                if cnt % batch_size == 0:
                    batch = cnt / batch_size
                    batch_train_content = np.array(train_content)
                    batch_train_title = np.array(train_title)
                    # Switch to time-major order and drop the <START> step for targets and mask.
                    batch_swap_train_title = batch_train_title.swapaxes(0, 1)[1:]
                    batch_train_masking = np.array(train_masking).swapaxes(0, 1)[1:]
                    if batch % 100 == 0:
                        loss, res, _ = sess.run(
                            [cost, result, train], feed_dict={
                                input_idx_placeholder: batch_train_content,
                                output_idx_placeholder: batch_train_title,
                                output_idx_swap_placeholder: batch_swap_train_title,
                                masking_placeholder: batch_train_masking})
                        out = np.array(res).swapaxes(0, 1).tolist()
                        for i in xrange(len(out)):
                            label_seq = [idx_word[j] for j in batch_train_title[i].tolist()]
                            pred = [idx_word[j] for j in out[i]]
                            print "Title: " + str(label_seq) + "\nPred: " + str(pred)
                    else:
                        loss, _ = sess.run(
                            [cost, train], feed_dict={
                                input_idx_placeholder: batch_train_content,
                                output_idx_placeholder: batch_train_title,
                                output_idx_swap_placeholder: batch_swap_train_title,
                                masking_placeholder: batch_train_masking})
                    print 'Epoch: ' + str(ep + 1) + ', Batch: ' + str(batch + 1) + \
                        ', Sample: ' + str(cnt) + ', Loss: ' + str(loss)
                    train_content = []
                    train_title = []
                    train_masking = []
            f.close()
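For completeness, this is how the shifted targets and the loss mask line up for a single short title, following the same construction as the training loop above (toy output_words_size = 6 and made-up tokens, just to illustrate):

import numpy as np

output_words_size = 6                              # toy value; the model uses 20
title_list = ['Seahawks', 'sign', 'Walsh']         # toy tokenized title

title_list.insert(0, '<START>')
title_masking = [1.0] * (len(title_list) + 1)      # 1s cover <START> ... first <END>
title_masking.extend([0.0] * (output_words_size - 1 - len(title_list)))
title_list.extend(['<END>'] * (output_words_size - len(title_list)))

print title_list      # ['<START>', 'Seahawks', 'sign', 'Walsh', '<END>', '<END>']
print title_masking   # [1.0, 1.0, 1.0, 1.0, 1.0, 0.0]

# After the swapaxes(0, 1)[1:] shift that feeds output_idx_swap_placeholder
# and masking_placeholder (batch of one here):
targets = np.array([title_list]).swapaxes(0, 1)[1:]
masks = np.array([title_masking]).swapaxes(0, 1)[1:]
print targets.ravel().tolist()   # ['Seahawks', 'sign', 'Walsh', '<END>', '<END>']
print masks.ravel().tolist()     # [1.0, 1.0, 1.0, 1.0, 0.0]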