I am currently working on an RNN that is supposed to generate text based on a sample training text. But for some reason, no matter how long I train it, it only ever produces gibberish. You can find the GitHub repo here.
Network definition
## RNN with num_layers LSTM layers and a fully-connected output layer
## The network allows for a dynamic number of iterations, depending on the inputs it receives.
##
##    out   (fc layer; out_size)
##     ^
##    lstm
##     ^
##    lstm  (lstm_size)
##     ^
##    in    (in_size)
class ModelNetwork:
    def __init__(self, in_size, lstm_size, num_layers, out_size, session, learning_rate=0.003, name="rnn"):
        self.scope = name
        self.in_size = in_size
        self.lstm_size = lstm_size
        self.num_layers = num_layers
        self.out_size = out_size
        self.session = session
        self.learning_rate = tf.constant(learning_rate)
        # Last state of LSTM, used when running the network in TEST mode
        self.lstm_last_state = np.zeros((self.num_layers * 2 * self.lstm_size,))
        with tf.variable_scope(self.scope):
            ## (batch_size, timesteps, in_size)
            self.xinput = tf.placeholder(tf.float32, shape=(None, None, self.in_size), name="xinput")
            self.lstm_init_value = tf.placeholder(tf.float32, shape=(None, self.num_layers * 2 * self.lstm_size), name="lstm_init_value")
            # LSTM
            self.lstm_cells = [tf.contrib.rnn.BasicLSTMCell(self.lstm_size, forget_bias=1.0, state_is_tuple=False) for i in range(self.num_layers)]
            self.lstm = tf.contrib.rnn.MultiRNNCell(self.lstm_cells, state_is_tuple=False)
            # Iteratively compute output of recurrent network
            outputs, self.lstm_new_state = tf.nn.dynamic_rnn(self.lstm, self.xinput, initial_state=self.lstm_init_value, dtype=tf.float32)
            # Linear activation (FC layer on top of the LSTM net)
            self.rnn_out_W = tf.Variable(tf.random_normal((self.lstm_size, self.out_size), stddev=0.01))
            self.rnn_out_B = tf.Variable(tf.random_normal((self.out_size,), stddev=0.01))
            outputs_reshaped = tf.reshape(outputs, [-1, self.lstm_size])
            network_output = tf.matmul(outputs_reshaped, self.rnn_out_W) + self.rnn_out_B
            batch_time_shape = tf.shape(outputs)
            self.final_outputs = tf.reshape(tf.nn.softmax(network_output), (batch_time_shape[0], batch_time_shape[1], self.out_size))
            ## Training: provide target outputs for supervised training.
            self.y_batch = tf.placeholder(tf.float32, (None, None, self.out_size))
            y_batch_long = tf.reshape(self.y_batch, [-1, self.out_size])
            self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=network_output, labels=y_batch_long))
            self.train_op = tf.train.RMSPropOptimizer(self.learning_rate, 0.9).minimize(self.cost)

    ## Input: X is a single element, not a list!
    def run_step(self, x, init_zero_state=True):
        ## Reset the initial state of the network.
        if init_zero_state:
            init_value = np.zeros((self.num_layers * 2 * self.lstm_size,))
        else:
            init_value = self.lstm_last_state
        out, next_lstm_state = self.session.run(
            [self.final_outputs, self.lstm_new_state],
            feed_dict={self.xinput: [x], self.lstm_init_value: [init_value]})
        self.lstm_last_state = next_lstm_state[0]
        return out[0][0]

    ## xbatch must be (batch_size, timesteps, input_size)
    ## ybatch must be (batch_size, timesteps, output_size)
    def train_batch(self, xbatch, ybatch):
        init_value = np.zeros((xbatch.shape[0], self.num_layers * 2 * self.lstm_size))
        cost, _ = self.session.run(
            [self.cost, self.train_op],
            feed_dict={self.xinput: xbatch, self.y_batch: ybatch, self.lstm_init_value: init_value})
        return cost


# Embed string to character-arrays -- it generates an array len(data) x len(vocab)
# Vocab is a list of elements
def embed_to_vocab(data_, vocab):
    data = np.zeros((len(data_), len(vocab)))
    cnt = 0
    for s in data_:
        v = [0.0] * len(vocab)
        v[vocab.index(s)] = 1.0
        data[cnt, :] = v
        cnt += 1
    return data


def decode_embed(array, vocab):
    return vocab[array.index(1)]
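
For reference, a minimal sketch of how these helpers are used downstream; the sample string and vocabulary below are made-up placeholders (the real script builds vocab from the training text), and np.argmax is used here instead of decode_embed because the latter expects a plain list:

import numpy as np

vocab = sorted(set("hello world"))        # illustrative vocab, e.g. [' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']
one_hot = embed_to_vocab("hello", vocab)  # shape (5, len(vocab)), exactly one 1.0 per row
print(one_hot.shape)                      # (5, 8)
print(vocab[int(np.argmax(one_hot[0]))])  # 'h' -- recover a character from its one-hot row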
Restore
if options.mode == 'test':
    if not options.uid:
        print("Please enter model id, use -u <model id>")
        return
    model_uuid = options.uid
    with tf.Session() as sess_res:
        saver_res = tf.train.import_meta_graph("model/" + model_uuid + "/model.ckpt.meta")
        saver_res.restore(sess, "model/" + model_uuid + "/model.ckpt")
        TEST_PREFIX = TEST_PREFIX.lower()
        for i in range(len(TEST_PREFIX)):
            out = net.run_step(embed_to_vocab(TEST_PREFIX[i], vocab), i == 0)
        gen_str = TEST_PREFIX
        for i in range(LEN_TEST_TEXT):
            # Sample character from the network according to the generated output probabilities
            element = np.random.choice(range(len(vocab)), p=out)
            gen_str += vocab[element]
            out = net.run_step(embed_to_vocab(vocab[element], vocab), False)
        print('----------------Text----------------')
        print(gen_str)
        print('----------------End----------------')
        text_file = open("data/output.txt", "w")
        text_file.write(gen_str)
        text_file.close()
Training
## Initialize the network
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.InteractiveSession(config=config)
net = ModelNetwork(in_size=in_size,
                   lstm_size=lstm_size,
                   num_layers=num_layers,
                   out_size=out_size,
                   session=sess,
                   learning_rate=learning_rate_prop,
                   name="char_rnn_network")
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())
last_time = time.time()
batch = np.zeros((batch_size, time_steps, in_size))
batch_y = np.zeros((batch_size, time_steps, in_size))
possible_batch_ids = range(data.shape[0]-time_steps-1)
if options.mode == 'train':
    for i in tqdm(range(NUM_TRAIN_BATCHES)):
        # Sample time_steps consecutive samples from the dataset text file
        batch_id = random.sample(possible_batch_ids, batch_size)
        for j in range(time_steps):
            ind1 = [k + j for k in batch_id]
            ind2 = [k + j + 1 for k in batch_id]
            batch[:, j, :] = data[ind1, :]
            batch_y[:, j, :] = data[ind2, :]
        cst = net.train_batch(batch, batch_y)
        if (i % 100) == 0:
            new_time = time.time()
            diff = new_time - last_time
            last_time = new_time
            print("batch: ", i, " loss: ", cst, " speed: ", (100.0 / diff), " batches / s")
    model_uuid = str(uuid.uuid1())
    saver.save(sess, "model/" + model_uuid + "/model.ckpt")
    print("Finished training model, model id: " + model_uuid)
Results
0%| | 0/500 [00:00<?, ?it/s]batch: 0 loss: 4.19432 speed: 239.7453419095813 batches / s
19%|#9 | 96/500 [00:01<00:13, 30.77it/s]batch: 100 loss: 3.9609 speed: 114.48676714604412 batches / s
38%|###8 | 192/500 [00:02<00:03, 100.71it/s]batch: 200 loss: 3.08484 speed: 116.24018792187363 batches / s
60%|###### | 300/500 [00:03<00:01, 112.94it/s]batch: 300 loss: 2.65907 speed: 112.51337575482982 batches / s
79%|#######9 | 396/500 [00:03<00:00, 114.93it/s]batch: 400 loss: 2.29085 speed: 113.07714974572966 batches / s
100%|##########| 500/500 [00:04<00:00, 104.44it/s]
Finished training model, model id: 335c3de2-37f9-11e8-a493-4ccc6abbb6f6
Hypothesis
I suspect the training itself works, because after 100k batches the loss is around 0.9-1. However, all of the saved .ckpt files have exactly the same size, no matter how long I train the model. All of this leads me to the hypothesis that, for some reason, the model is not actually being saved, even though I followed this blog post on how to save and restore a model.
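
Identical .ckpt file sizes are expected on their own, since a checkpoint always stores the same set of variables with the same shapes regardless of their values, so file size alone does not tell you whether the trained weights were written. One way to check the hypothesis directly is sketched below, assuming a TF 1.x build that exports tf.train.list_variables and tf.train.load_variable; the paths are placeholders for the model ids printed after two different training runs:

import numpy as np
import tensorflow as tf

# Placeholder paths -- substitute the model ids printed at the end of two training runs.
ckpt_a = "model/<first-model-id>/model.ckpt"
ckpt_b = "model/<second-model-id>/model.ckpt"

for name, shape in tf.train.list_variables(ckpt_a):
    a = tf.train.load_variable(ckpt_a, name)
    b = tf.train.load_variable(ckpt_b, name)
    # If training progress is really being saved, the weights should differ between runs.
    print(name, shape, "identical" if np.allclose(a, b) else "different")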
Answer (score: 0)
The problem was that, when restoring the model, I was using the last, untrained layers of the network to predict the next character in the net.run_step() function.
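
One likely reading of this, given the restore snippet above: tf.train.import_meta_graph adds a second copy of the graph and the checkpoint is restored into that copy, while net.run_step() keeps evaluating the freshly initialized variables created when ModelNetwork was constructed for the test run. A minimal sketch of a restore that avoids this, assuming the same constructor arguments and the same name="char_rnn_network" used during training so the variable names match the checkpoint:

tf.reset_default_graph()
sess = tf.Session()
# Rebuild exactly the training graph instead of importing a second copy of it.
net = ModelNetwork(in_size=in_size,
                   lstm_size=lstm_size,
                   num_layers=num_layers,
                   out_size=out_size,
                   session=sess,
                   learning_rate=learning_rate_prop,
                   name="char_rnn_network")
saver = tf.train.Saver(tf.global_variables())
# Restore the trained weights into these variables; do NOT run global_variables_initializer() afterwards.
saver.restore(sess, "model/" + model_uuid + "/model.ckpt")
# net.run_step() now predicts with the restored weights.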