Why does the cost of my seq2seq model keep getting larger and larger?

Date: 2017-03-20 03:27:30

Tags: python tensorflow deep-learning lstm

I tried to implement a basic seq2seq model with TensorFlow to summarize documents, without using the ready-made modules in tf.contrib. After I finished the model and started training, I found that the cost keeps getting larger and larger and finally becomes NaN. This is very strange to me, because I optimize the cost with an optimizer that clips the gradients (the exact pattern is sketched right after the log). The predictions of the model are also strange: every predicted sequence is identical to the previous one. Can anyone help me? Thank you very much! Here is the log:

Epoch: 1, Batch: 9660, Sample: 309088, Loss: 10501.7
Epoch: 1, Batch: 9661, Sample: 309120, Loss: 12208.6
Epoch: 1, Batch: 9662, Sample: 309152, Loss: 10829.7
Epoch: 1, Batch: 9663, Sample: 309184, Loss: nan
Epoch: 1, Batch: 9664, Sample: 309216, Loss: nan


Title: ['<START>', 'Seahawks', 'sign', 'former', 'Minnesota', 'kicker', 'Blair', 'Walsh', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']
Pred: ['<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>']
Title: ['<START>', 'Redskins', 'WR', 'DeSean', 'Jackson', 'Packers', 'LB', 'Clay', 'Matthews', 'active', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>', '<END>']
Pred: ['<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>', '<START>']
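
The clipping I mention is the plain compute_gradients / clip_by_value / apply_gradients pattern (the same lines appear in the full code below). A self-contained toy version, where the variable and the loss are dummies only so the snippet runs on its own:

import tensorflow as tf

# dummy variable and loss, only here so the clipping pattern is runnable;
# in my model `cost` is the masked cross-entropy defined further down
w = tf.Variable([1.0, 2.0])
cost = tf.reduce_sum(tf.square(w))

# Adam, with every gradient element clipped to [-1, 1] before the update
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
gvs = optimizer.compute_gradients(cost)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
train = optimizer.apply_gradients(capped_gvs)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train)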

Here is my TensorFlow code:

import json

import numpy as np
import tensorflow as tf

# gen_top_N_list, gen_emb_mat and tokenize are helper functions defined elsewhere (not shown here)


def model(argv):
    # model and training hyperparameters
    batch_size = 32
    vocab_size = 100000
    emb_size = 200
    input_words_size = 500
    output_words_size = 20
    min_input_size = 100
    encoder_hidden_unit = 512
    encoder_rnn_layer = 3
    encoder_keep_prob = 1.0
    decoder_hidden_unit = 512
    decoder_rnn_layer = 3
    decoder_keep_prob = 1.0
    graph = tf.Graph()
    with graph.as_default():
        with tf.device('/cpu:0'):
            with tf.name_scope('input_layer'):
                embedding_placeholder = tf.placeholder(
                    tf.float32, [vocab_size, emb_size], 'emb_holder')
                input_idx_placeholder = tf.placeholder(
                    tf.int64, [None, input_words_size], 'input_idx')
                output_idx_placeholder = tf.placeholder(
                    tf.int64, [None, output_words_size], 'output_idx')
                output_idx_swap_placeholder = tf.placeholder(
                    tf.int64, [output_words_size-1, None], 'output_idx_swap')
                masking_placeholder = tf.placeholder(
                    tf.float32, [output_words_size-1, None], 'masking'
                )

            with tf.name_scope('pre-train_embedding'):
                emb_weight = tf.Variable(tf.constant(
                    0.0, shape=[vocab_size, emb_size]), trainable=False, name='emb_W')
                embedding_init = emb_weight.assign(embedding_placeholder)

            with tf.name_scope('embedding_layer'):
                encoder_embedding_out = tf.nn.embedding_lookup(emb_weight, input_idx_placeholder)
                decoder_embedding_out = tf.nn.embedding_lookup(emb_weight, output_idx_placeholder)

            with tf.name_scope('gen_decoder_input_list'):
                target_label = tf.unstack(output_idx_swap_placeholder)
                target_masking = tf.unstack(masking_placeholder)

        with tf.device('/gpu:0'):
            with tf.name_scope('encoder'):
                encoder_rnn_cell = tf.contrib.rnn.GRUCell(encoder_hidden_unit)
                encoder_rnn_cell = tf.contrib.rnn.DropoutWrapper(
                    encoder_rnn_cell, output_keep_prob=encoder_keep_prob)
                encoder = tf.contrib.rnn.MultiRNNCell([encoder_rnn_cell] * encoder_rnn_layer)
                encoder_init_state = encoder.zero_state(batch_size, tf.float32)
                encoder_outputs = []
                encoder_state = encoder_init_state
                # manually unroll the encoder over the input time steps
                with tf.variable_scope('encoder_rnn'):
                    for i in xrange(input_words_size):
                        if i > 0:
                            tf.get_variable_scope().reuse_variables()
                        (rnn_output, encoder_state) = encoder(encoder_embedding_out[:, i, :], encoder_state)
                        encoder_outputs.append(rnn_output)
                # encoder_output = encoder_outputs[-1]
                encoder_final_state = encoder_state

        with tf.device('/cpu:0'):
            with tf.name_scope('decoder'):
                decoder_b = tf.get_variable(
                    "decoder_bias", shape=[vocab_size], initializer=tf.contrib.layers.xavier_initializer())
                decoder_weight = tf.get_variable(
                    "decoder_weight",
                    shape=[decoder_hidden_unit, vocab_size],
                    initializer=tf.contrib.layers.xavier_initializer())
                decoder_rnn_cell = tf.contrib.rnn.GRUCell(decoder_hidden_unit)
                decoder_rnn_cell = tf.contrib.rnn.DropoutWrapper(
                    decoder_rnn_cell, output_keep_prob=decoder_keep_prob)
                decoder = tf.contrib.rnn.MultiRNNCell([decoder_rnn_cell] * decoder_rnn_layer)
                # decoder_init_state = decoder.zero_state(batch_size, tf.float32)
                decoder_init_state = encoder_final_state
                decoder_outputs = []
                decoder_state = decoder_init_state
                decoder_pred_state = decoder_init_state
                decoder_pred_outputs = []
                result = []
                with tf.variable_scope('decoder_rnn'):
                    # training pass: teacher forcing with the reference title as decoder input
                    for i in xrange(output_words_size):
                        if i > 0:
                            tf.get_variable_scope().reuse_variables()
                        (rnn_output, decoder_state) = decoder(decoder_embedding_out[:, i, :], decoder_state)
                        decoder_outputs.append(rnn_output)
                    # prediction pass: feed back the embedding of the previous argmax
                    for i in xrange(output_words_size):
                        tf.get_variable_scope().reuse_variables()
                        if i == 0:
                            (pred_output, decoder_pred_state) = decoder(
                                decoder_embedding_out[:, i, :], decoder_pred_state)
                        else:
                            (pred_output, decoder_pred_state) = decoder(decoder_pred_outputs[-1], decoder_pred_state)
                        pred_logit = tf.matmul(pred_output, decoder_weight) + decoder_b
                        pred_result = tf.argmax(pred_logit, 1)
                        result.append(pred_result)
                        pred_emb = tf.nn.embedding_lookup(emb_weight, pred_result)
                        decoder_pred_outputs.append(pred_emb)

        with tf.device('/gpu:0'):
            with tf.name_scope('train_gpu'):
                train_seq = decoder_outputs[:-1]
                train_logits = [tf.matmul(i, decoder_weight) + decoder_b for i in train_seq]
                log_prep_list = []
                # masked cross-entropy against the shifted targets
                for logit, target, masking in zip(train_logits, target_label, target_masking):
                    target = tf.reshape(target, [-1])
                    masking = tf.reshape(masking, [-1])
                    crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        labels=target, logits=logit)
                    log_prep_list.append(crossent * masking)
                log_preps = tf.add_n(log_prep_list)
                # cost is the raw sum over all time steps and the whole batch
                cost = tf.reduce_sum(log_preps)
                # train = tf.train.GradientDescentOptimizer(0.01).minimize(cost)
                # Adam with every gradient element clipped to [-1, 1]
                optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
                gvs = optimizer.compute_gradients(cost)
                capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
                train = optimizer.apply_gradients(capped_gvs)
                # target_seq = decoder_embedding_out[:, 1:, :]

            with tf.name_scope('initial'):
                init = tf.global_variables_initializer()

    with tf.Session(graph=graph) as sess:
        vector_file = argv[1]
        rank_file = argv[2]
        emb_size = int(argv[3])
        train_file = argv[4]
        epoch = int(argv[5])
        valid_list = gen_top_N_list(vocab_size, rank_file)
        word_idx, idx_word, emb_mat = gen_emb_mat(vector_file, emb_size, valid_list)

        sess.run(init)
        sess.run(embedding_init, feed_dict={embedding_placeholder: emb_mat})
        del emb_mat
        for ep in xrange(epoch):
            f = open(train_file, 'r')
            cnt = 0
            train_content = []
            train_title = []
            train_masking = []
            for line in f:
                data = json.loads(line.strip())
                title_list = tokenize(data['title'])
                content_list = tokenize(data['content'])
                if len(title_list) > output_words_size - 2 \
                        or len(content_list) > input_words_size \
                        or len(content_list) < min_input_size:
                    continue
                title_list.insert(0, '<START>')
                title_masking = [1.0] * (len(title_list) + 1)
                title_masking.extend([0.0] * (output_words_size - 1 - len(title_list)))
                title_list.extend(['<END>'] * (output_words_size - len(title_list)))
                content_list.extend(['<END>'] * (input_words_size - len(content_list)))
                train_content.append([word_idx.get(i, word_idx['<unk>']) for i in content_list])
                train_title.append([word_idx.get(i, word_idx['<unk>']) for i in title_list])
                train_masking.append(title_masking)
                cnt += 1
                if cnt % batch_size == 0:
                    batch = cnt / batch_size
                    batch_train_content = np.array(train_content)
                    batch_train_title = np.array(train_title)
                    batch_swap_train_title = batch_train_title.swapaxes(0, 1)[1:]
                    batch_train_masking = np.array(train_masking).swapaxes(0, 1)[1:]
                    if batch % 100 == 0:
                        loss, res, _ = sess.run(
                            [cost, result, train], feed_dict={
                                input_idx_placeholder: batch_train_content,
                                output_idx_placeholder: batch_train_title,
                                output_idx_swap_placeholder: batch_swap_train_title,
                                masking_placeholder: batch_train_masking})
                        out = np.array(res).swapaxes(0, 1).tolist()
                        for i in xrange(len(out)):
                            label_seq = [idx_word[j] for j in batch_train_title[i].tolist()]
                            pred = [idx_word[j] for j in out[i]]
                            print "Title: " + str(label_seq) + "\nPred: " + str(pred)
                    else:
                        loss, _ = sess.run(
                            [cost, train], feed_dict={
                                input_idx_placeholder: batch_train_content,
                                output_idx_placeholder: batch_train_title,
                                output_idx_swap_placeholder: batch_swap_train_title,
                                masking_placeholder: batch_train_masking})
                        print 'Epoch: ' + str(ep + 1) + ', Batch: ' + str(batch + 1) + \
                              ', Sample: ' + str(cnt) + ', Loss: ' + str(loss)
                    train_content = []
                    train_title = []
                    train_masking = []
            f.close()
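
For reference, these are the array shapes I feed for one batch (a standalone check with dummy arrays that reuses the same swapaxes calls as the loop above; the zeros and ones are placeholders, not real data):

import numpy as np

batch_size, input_words_size, output_words_size = 32, 500, 20

# dummy batches with the same shapes as the real feed_dict
batch_train_content = np.zeros((batch_size, input_words_size), dtype=np.int64)   # -> input_idx
batch_train_title = np.zeros((batch_size, output_words_size), dtype=np.int64)    # -> output_idx
batch_train_masking = np.ones((batch_size, output_words_size), dtype=np.float32)

batch_swap_train_title = batch_train_title.swapaxes(0, 1)[1:]      # -> output_idx_swap
batch_swap_train_masking = batch_train_masking.swapaxes(0, 1)[1:]  # -> masking

print(batch_train_content.shape)       # (32, 500)
print(batch_train_title.shape)         # (32, 20)
print(batch_swap_train_title.shape)    # (19, 32)
print(batch_swap_train_masking.shape)  # (19, 32)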

0 Answers:

No answers yet