I am currently working on an RNN that is supposed to generate text based on a sample training text. But for some reason, no matter how long I train it, it only ever produces gibberish. You can find the GitHub repo here.
Network definition
## RNN with num_layers LSTM layers and a fully-connected output layer
## The network allows for a dynamic number of iterations, depending on the inputs it receives.
##
##    out   (fc layer; out_size)
##     ^
##    lstm
##     ^
##    lstm  (lstm_size)
##     ^
##    in    (in_size)
class ModelNetwork:
    def __init__(self, in_size, lstm_size, num_layers, out_size, session, learning_rate=0.003, name="rnn"):
        self.scope = name
        self.in_size = in_size
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.out_size = out_size
        self.session = session
        self.learning_rate = tf.constant(learning_rate)
        # Last state of LSTM, used when running the network in TEST mode
        self.lstm_last_state = np.zeros((self.num_layers * 2 * self.lstm_size,))
        with tf.variable_scope(self.scope):
            ## (batch_size, timesteps, in_size)
            self.xinput = tf.placeholder(tf.float32, shape=(None, None, self.in_size), name="xinput")
            self.lstm_init_value = tf.placeholder(tf.float32, shape=(None, self.num_layers * 2 * self.lstm_size), name="lstm_init_value")
            # LSTM
            self.lstm_cells = [tf.contrib.rnn.BasicLSTMCell(self.lstm_size, forget_bias=1.0, state_is_tuple=False) for i in range(self.num_layers)]
            self.lstm = tf.contrib.rnn.MultiRNNCell(self.lstm_cells, state_is_tuple=False)
            # Iteratively compute output of recurrent network
            outputs, self.lstm_new_state = tf.nn.dynamic_rnn(self.lstm, self.xinput, initial_state=self.lstm_init_value, dtype=tf.float32)
            # Linear activation (FC layer on top of the LSTM net)
            self.rnn_out_W = tf.Variable(tf.random_normal((self.lstm_size, self.out_size), stddev=0.01))
            self.rnn_out_B = tf.Variable(tf.random_normal((self.out_size,), stddev=0.01))
            outputs_reshaped = tf.reshape(outputs, [-1, self.lstm_size])
            network_output = tf.matmul(outputs_reshaped, self.rnn_out_W) + self.rnn_out_B
            batch_time_shape = tf.shape(outputs)
            self.final_outputs = tf.reshape(tf.nn.softmax(network_output), (batch_time_shape[0], batch_time_shape[1], self.out_size))
            ## Training: provide target outputs for supervised training.
            self.y_batch = tf.placeholder(tf.float32, (None, None, self.out_size))
            y_batch_long = tf.reshape(self.y_batch, [-1, self.out_size])
            self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=network_output, labels=y_batch_long))
            self.train_op = tf.train.RMSPropOptimizer(self.learning_rate, 0.9).minimize(self.cost)

    ## Input: X is a single element, not a list!
    def run_step(self, x, init_zero_state=True):
        ## Reset the initial state of the network.
        if init_zero_state:
            init_value = np.zeros((self.num_layers * 2 * self.lstm_size,))
        else:
            init_value = self.lstm_last_state
        out, next_lstm_state = self.session.run(
            [self.final_outputs, self.lstm_new_state],
            feed_dict={self.xinput: [x], self.lstm_init_value: [init_value]})
        self.lstm_last_state = next_lstm_state[0]
        return out[0][0]

    ## xbatch must be (batch_size, timesteps, input_size)
    ## ybatch must be (batch_size, timesteps, output_size)
    def train_batch(self, xbatch, ybatch):
        init_value = np.zeros((xbatch.shape[0], self.num_layers * 2 * self.lstm_size))
        cost, _ = self.session.run(
            [self.cost, self.train_op],
            feed_dict={self.xinput: xbatch, self.y_batch: ybatch, self.lstm_init_value: init_value})
        return cost


# Embed string to character-arrays -- it generates an array len(data) x len(vocab)
# Vocab is a list of elements
def embed_to_vocab(data_, vocab):
    data = np.zeros((len(data_), len(vocab)))
    cnt = 0
    for s in data_:
        v = [0.0] * len(vocab)
        v[vocab.index(s)] = 1.0
        data[cnt, :] = v
        cnt += 1
    return data


def decode_embed(array, vocab):
    return vocab[array.index(1)]
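
For reference, a minimal sketch of how these helpers are used downstream; the sample string and vocabulary below are made-up placeholders (the real script builds vocab from the training text), and np.argmax is used here instead of decode_embed because the latter expects a plain list:

import numpy as np

vocab = sorted(set("hello world"))        # illustrative vocab, e.g. [' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']
one_hot = embed_to_vocab("hello", vocab)  # shape (5, len(vocab)), exactly one 1.0 per row
print(one_hot.shape)                      # (5, 8)
print(vocab[int(np.argmax(one_hot[0]))])  # 'h' -- recover a character from its one-hot row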
Restore
if options.mode == 'test':
    if not options.uid:
        print("Please enter model id, use -u <model id>")
        return
    model_uuid = options.uid
    with tf.Session() as sess_res:
        saver_res = tf.train.import_meta_graph("model/" + model_uuid + "/model.ckpt.meta")
        saver_res.restore(sess, "model/" + model_uuid + "/model.ckpt")
        TEST_PREFIX = TEST_PREFIX.lower()
        for i in range(len(TEST_PREFIX)):
            out = net.run_step(embed_to_vocab(TEST_PREFIX[i], vocab), i == 0)
        gen_str = TEST_PREFIX
        for i in range(LEN_TEST_TEXT):
            # Sample character from the network according to the generated output probabilities
            element = np.random.choice(range(len(vocab)), p=out)
            gen_str += vocab[element]
            out = net.run_step(embed_to_vocab(vocab[element], vocab), False)
        print('----------------Text----------------')
        print(gen_str)
        print('----------------End----------------')
        text_file = open("data/output.txt", "w")
        text_file.write(gen_str)
        text_file.close()
Training
## Initialize the network
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.InteractiveSession(config=config)
net = ModelNetwork(in_size=in_size,
                   lstm_size=lstm_size,
                   num_layers=num_layers,
                   out_size=out_size,
                   session=sess,
                   learning_rate=learning_rate_prop,
                   name="char_rnn_network")
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
last_time = time.time()
batch = np.zeros((batch_size, time_steps, in_size))
batch_y = np.zeros((batch_size, time_steps, in_size))
possible_batch_ids = range(data.shape[0]-time_steps-1)
if options.mode == 'train':
    for i in tqdm(range(NUM_TRAIN_BATCHES)):
        # Sample time_steps consecutive samples from the dataset text file
        batch_id = random.sample(possible_batch_ids, batch_size)
        for j in range(time_steps):
            ind1 = [k + j for k in batch_id]
            ind2 = [k + j + 1 for k in batch_id]
            batch[:, j, :] = data[ind1, :]
            batch_y[:, j, :] = data[ind2, :]
        cst = net.train_batch(batch, batch_y)
        if (i % 100) == 0:
            new_time = time.time()
            diff = new_time - last_time
            last_time = new_time
            print("batch: ", i, " loss: ", cst, " speed: ", (100.0 / diff), " batches / s")
    model_uuid = str(uuid.uuid1())
    saver.save(sess, "model/" + model_uuid + "/model.ckpt")
    print("Finished training model, model id: " + model_uuid)
Results
0%| | 0/500 [00:00<?, ?it/s]batch: 0 loss: 4.19432 speed: 239.7453419095813 batches / s
19%|#9 | 96/500 [00:01<00:13, 30.77it/s]batch: 100 loss: 3.9609 speed: 114.48676714604412 batches / s
38%|###8 | 192/500 [00:02<00:03, 100.71it/s]batch: 200 loss: 3.08484 speed: 116.24018792187363 batches / s
60%|###### | 300/500 [00:03<00:01, 112.94it/s]batch: 300 loss: 2.65907 speed: 112.51337575482982 batches / s
79%|#######9 | 396/500 [00:03<00:00, 114.93it/s]batch: 400 loss: 2.29085 speed: 113.07714974572966 batches / s
100%|##########| 500/500 [00:04<00:00, 104.44it/s]
Finished training model, model id: 335c3de2-37f9-11e8-a493-4ccc6abbb6f6
Hypothesis
I suspect the training itself works, because after 100k batches the loss is around 0.9-1. However, all of the saved .ckpt files have exactly the same size, no matter how long I train the model. All of this leads me to the hypothesis that, for some reason, the model is not actually being saved, even though I followed this blog post on how to save and restore a model.
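
Identical .ckpt file sizes are expected on their own, since a checkpoint always stores the same set of variables with the same shapes regardless of their values, so file size alone does not tell you whether the trained weights were written. One way to check the hypothesis directly is sketched below, assuming a TF 1.x build that exports tf.train.list_variables and tf.train.load_variable; the paths are placeholders for the model ids printed after two different training runs:

import numpy as np
import tensorflow as tf

# Placeholder paths -- substitute the model ids printed at the end of two training runs.
ckpt_a = "model/<first-model-id>/model.ckpt"
ckpt_b = "model/<second-model-id>/model.ckpt"

for name, shape in tf.train.list_variables(ckpt_a):
    a = tf.train.load_variable(ckpt_a, name)
    b = tf.train.load_variable(ckpt_b, name)
    # If training progress is really being saved, the weights should differ between runs.
    print(name, shape, "identical" if np.allclose(a, b) else "different")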
Answer (score: 0)
The problem was that, when restoring the model, I was using the last, untrained layers of the network to predict the next character in the net.run_step() function.
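
One likely reading of this, given the restore snippet above: tf.train.import_meta_graph adds a second copy of the graph and the checkpoint is restored into that copy, while net.run_step() keeps evaluating the freshly initialized variables created when ModelNetwork was constructed for the test run. A minimal sketch of a restore that avoids this, assuming the same constructor arguments and the same name="char_rnn_network" used during training so the variable names match the checkpoint:

tf.reset_default_graph()
sess = tf.Session()
# Rebuild exactly the training graph instead of importing a second copy of it.
net = ModelNetwork(in_size=in_size,
                   lstm_size=lstm_size,
                   num_layers=num_layers,
                   out_size=out_size,
                   session=sess,
                   learning_rate=learning_rate_prop,
                   name="char_rnn_network")
saver = tf.train.Saver(tf.global_variables())
# Restore the trained weights into these variables; do NOT run global_variables_initializer() afterwards.
saver.restore(sess, "model/" + model_uuid + "/model.ckpt")
# net.run_step() now predicts with the restored weights.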