LSTM loss increases during training

Time: 2018-06-29 19:27:24

Tags: tensorflow machine-learning lstm recurrent-neural-network


When I train my LSTM with TensorFlow, the loss increases after a few epochs. I checked the code but could not find the problem:

    import time
    import tensorflow as tf


    def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
        # stack num_layers recurrent cells (GRU cells, despite the lstm name)
        stack_drop = []
        for i in range(num_layers):
            lstm = tf.nn.rnn_cell.GRUCell(lstm_size)
            # dropout is currently disabled; if re-enabled, the wrapper has to
            # wrap the cell itself, not lstm_size:
            # drop = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=keep_prob)
            # stack_drop.append(drop)
            stack_drop.append(lstm)
        cell = tf.nn.rnn_cell.MultiRNNCell(stack_drop, state_is_tuple=True)
        initial_state = cell.zero_state(batch_size, tf.float32)

        return cell, initial_state



    # In[11]:

    def build_output(lstm_output, in_size, out_size):
        # lstm_output: outputs of the recurrent layers, shape [batch, steps, in_size]
        # in_size: size of the lstm layer output
        # out_size: number of classes (here a 1*2 dimension)
        seq_output = tf.concat(lstm_output, axis=1)
        # reshape so that every time step becomes one row
        x = tf.reshape(seq_output, [-1, in_size])

        with tf.variable_scope('softmax'):
            softmax_w = tf.Variable(tf.truncated_normal([in_size, out_size], stddev=1.))
            softmax_b = tf.Variable(tf.zeros(out_size))

        logits = tf.matmul(x, softmax_w) + softmax_b
        out = tf.nn.softmax(logits, name='predictions')

        return out, logits




    # In[12]:

    def build_loss(logits, targets, lstm_size, num_classes):
        y_one_hot = tf.one_hot(targets, num_classes)
        y_reshaped = tf.reshape(y_one_hot, logits.get_shape())

        # Softmax cross entropy loss
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_reshaped)
        loss = tf.reduce_mean(loss)

        return loss




    def build_optimizer(loss, learning_rate, grad_clip):
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
        train_op = tf.train.AdamOptimizer(learning_rate)
        optimizer = train_op.apply_gradients(zip(grads, tvars))

        return optimizer
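
    # Not what I currently run, just a sketch: tf.clip_by_global_norm also
    # returns the pre-clipping global norm, so I could expose it and fetch it
    # together with the loss to see whether grad_clip is actually being hit.
    def build_optimizer_with_norm(loss, learning_rate, grad_clip):
        tvars = tf.trainable_variables()
        grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)
        train_op = tf.train.AdamOptimizer(learning_rate)
        optimizer = train_op.apply_gradients(zip(grads, tvars))

        # global_norm is the norm of the gradients before clipping
        return optimizer, global_norm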


    # In[14]:

    class CharRNN:

        def __init__(self, num_classes, batch_size=64, num_steps=50,
                     lstm_size=2*len_arr, num_layers=1, learning_rate=0.001,
                     grad_clip=5, sampling=False):

            # when sampling, feed one step of one sequence at a time
            if sampling == True:
                batch_size, num_steps = 1, 1
            else:
                batch_size, num_steps = batch_size, num_steps

            tf.reset_default_graph()

            # placeholders for inputs, targets and dropout keep probability
            self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)

            # recurrent cells and their initial state
            cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)

            # one-hot encode the inputs and run them through the RNN
            x_one_hot = tf.one_hot(self.inputs, num_classes)
            outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
            self.final_state = state

            # softmax output, loss and optimizer
            self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
            self.loss = build_loss(self.logits, self.targets, lstm_size, num_classes)
            self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)




    batch_size = 500         # Sequences per batch
    num_steps = len_arr          # Number of sequence steps per batch
    lstm_size = 100         # Size of hidden layers in LSTMs
    num_layers = 3          # Number of LSTM layers
    learning_rate = 0.001    # Learning rate
    keep_prob = 0.8         # Dropout keep probability


    # In[16]:

    epochs = 300
    save_every_n = 50

    model = CharRNN(len(spin), batch_size=batch_size, num_steps=num_steps,
                    lstm_size=lstm_size, num_layers=num_layers, 
                    learning_rate=learning_rate)

    saver = tf.train.Saver(max_to_keep=2)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        counter = 0
        for e in range(epochs):
            # Train network
            new_state = sess.run(model.initial_state)
            loss = 0
            for x, y in get_batches(encoded, batch_size, num_steps):
                counter += 1
                start = time.time()
                feed = {model.inputs: x,
                        model.targets: y,
                        model.keep_prob: keep_prob,
                        model.initial_state: new_state}
                batch_loss, new_state, _ = sess.run([model.loss, 
                                                     model.final_state, 
                                                     model.optimizer], 
                                                     feed_dict=feed)

                end = time.time()
                # control the print lines
                if counter % 1000 == 0:
                    print('epoch: {}/{}... '.format(e+1, epochs),
                          'steps: {}... '.format(counter),
                          'loss: {:.4f}... '.format(batch_loss),
                          '{:.4f} sec/batch'.format((end-start)))

                #if (counter % save_every_n == 0):
                #    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))        
        saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

The gradients are clipped with grad_clip and I use softmax_cross_entropy for the loss. Could there be a problem with how I run the session, or should I try other hyperparameters and then look at the loss again?
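
For the hyperparameter part, one thing I could do is rebuild the model with a few different learning rates using the CharRNN class above and compare the loss curves (only a rough sketch, the values are illustrative):

    # illustrative learning-rate sweep, reusing the CharRNN class defined above
    for lr in [1e-2, 1e-3, 1e-4]:
        model = CharRNN(len(spin), batch_size=batch_size, num_steps=num_steps,
                        lstm_size=lstm_size, num_layers=num_layers,
                        learning_rate=lr)
        # ... run the same training loop as above and record batch_loss ...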

0 Answers:

No answers yet.