I'm having trouble training a seq2seq model with TensorFlow on an Nvidia P100 GPU. These are the versions I'm using: TensorFlow 1.10.0, Keras 2.2.2, Python 3.6.3, CUDA 9.2.148.1, cuDNN 7.2.1
I currently hit an OOM error well into the training process (around the 18-minute mark).
I've been doing some digging and tried setting allow_growth = True (the flag is not set in the code below), but I never managed to see any memory actually grow: it is all allocated at the start.
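For reference, this is roughly how I set the flag when I tried it (standard TF 1.x session config; not shown in the code below):

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand instead of grabbing it all up front
train_sess = tf.Session(graph=train_graph, config=config)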
I also tried making the graph read-only with tf.Graph.finalize(), and the program still ran, which suggests either that no nodes are being added after construction, or, if I didn't place the call at the right spot in the code, that the check never took effect.
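For clarity, this is the kind of check I mean: graph.finalize() makes the graph read-only, so TensorFlow raises a RuntimeError if anything tries to add ops afterwards. A minimal standalone sketch:

import tensorflow as tf

g = tf.Graph()
with g.as_default():
    x = tf.constant(1.0)  # all graph construction happens here
g.finalize()  # g is now read-only
with g.as_default():
    try:
        y = tf.constant(2.0)  # adding an op to a finalized graph...
    except RuntimeError as e:
        print("graph is read-only:", e)  # ...raises RuntimeError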
Since the graph does train and can be saved and all that, it doesn't appear to be too big at the start.
Here are some of the hyperparameters I'm using:
batch_size = 10
rnn_size = 1024
src/tgt_vocab_size = 8000
epochs = 20
display_steps = 10
The dataset consists of docstrings and their associated function code, so the sequences are not overly long. One of my initial thoughts was that, since sentence sizes are dynamic, a single very long sentence might be too big. But I reshuffled the dataset to see whether the crash would happen at a different time, and it still happens at 18 minutes with the same parameters.
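To rule long sequences out more directly, I suppose I could also cap the length before batching. A minimal sketch against the zipped dataset in the code below (the 200-token cutoff and the argument names are just illustrative, not from my actual code):

dataset = dataset.filter(
    lambda src, tgt, tgt_in, tgt_len: tf.logical_and(tf.size(src) <= 200, tf.size(tgt) <= 200))  # keep only pairs where both sides fit the cutoff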
Here is the code containing the graph and the training/testing loop.
import tensorflow as tf
import numpy as np
import nltk
from datetime import datetime

def train(...):
    ...
    ...

    def source_generator():
        for el in train_source_sents:
            yield el

    def target_generator():
        for el in train_target_sents:
            yield el

    train_graph = tf.Graph()
    with train_graph.as_default():

        # Dataset and batch preparation
        with tf.name_scope("Dataset-prep"):
            source_dataset = tf.data.Dataset.from_generator(source_generator, output_types=tf.int32, output_shapes=(tf.TensorShape([None])))  # Converting the sentence array to a dataset
            target_dataset = tf.data.Dataset.from_generator(target_generator, output_types=tf.int32, output_shapes=(tf.TensorShape([None])))  # Converting the sentence array to a dataset

            target_dataset = target_dataset.map(lambda x: tf.concat([x, [target_tok2ID['<EOS>']]], 0))  # Appending the <EOS> token to the end of every target sentence
            target_input_dataset = target_dataset.map(lambda x: tf.concat([[target_tok2ID['<GO>']], x], 0))  # Creating training inputs for the decoder; this requires prepending <GO> to each sequence
            target_sequence_length = target_dataset.map(lambda x: tf.shape(x)[0])  # Recording the length of every sequence, to be paired up with the rest of the dataset

            dataset = tf.data.Dataset.zip((source_dataset, target_dataset, target_input_dataset, target_sequence_length))  # Combining the individual datasets into one
            dataset = dataset.shuffle(buffer_size=10000)

        with tf.name_scope("Dataset-batching"):
            dataset = dataset.repeat(epochs)
            pad_id = target_tok2ID['<PAD>']
            batched_dataset = dataset.padded_batch(batch_size, padded_shapes=([None], [None], [None], []), padding_values=(pad_id, pad_id, pad_id, pad_id))
            batched_dataset = batched_dataset.prefetch(buffer_size=batch_size)  # Could be removed; perhaps yields improvements
            iterator = batched_dataset.make_one_shot_iterator()
            source_batch, target_batch, target_input_batch, batch_target_sequence_length = iterator.get_next()

        with tf.name_scope("Encoding-layer"):
            source_vocab_size = len(source_tok2ID)
            embed = tf.contrib.layers.embed_sequence(source_batch, vocab_size=source_vocab_size, embed_dim=encoding_embedding_size)
            stacked_cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.LSTMCell(rnn_size), keep_prob) for _ in range(num_layers)])
            outputs, encoder_state = tf.nn.dynamic_rnn(stacked_cells, embed, dtype=tf.float32)

        with tf.name_scope("Decoding-layer"):
            target_vocab_size = len(target_tok2ID)
            dec_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
            dec_embed_input = tf.nn.embedding_lookup(dec_embeddings, target_input_batch)
            cells = tf.contrib.rnn.MultiRNNCell([tf.contrib.rnn.LSTMCell(rnn_size) for _ in range(num_layers)])
            output_layer = tf.layers.Dense(target_vocab_size)

            dec_cell = tf.contrib.rnn.DropoutWrapper(cells, output_keep_prob=keep_prob)
            helper = tf.contrib.seq2seq.TrainingHelper(dec_embed_input, batch_target_sequence_length)
            decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, encoder_state, output_layer)

            max_target_sequence_length = tf.reduce_max(batch_target_sequence_length, axis=0)
            dec_outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder, impute_finished=True, maximum_iterations=max_target_sequence_length)
            dec_outputs = tf.identity(dec_outputs.rnn_output, name='logits')  # This step may seem a little mysterious, but it takes the output from the dynamic decoder and exposes it as a named tensor (documentation is scarce here)

        masks = tf.sequence_mask(batch_target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='masks')
        cost = tf.contrib.seq2seq.sequence_loss(dec_outputs, target_batch, masks)

        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]  # Gradient clipping
        train_op = optimizer.apply_gradients(capped_gradients)

        tf.summary.scalar('cost', cost)
        merged = tf.summary.merge_all()

        train_saver = tf.train.Saver()
        init_op = tf.global_variables_initializer()

    train_sess = tf.Session(graph=train_graph)
    if model_path:
        print(model_path)
        train_saver.restore(train_sess, model_path)
    else:
        train_sess.run(init_op)

    store_path = "./visualisations/"
    writer = tf.summary.FileWriter(store_path, train_sess.graph)

    step = 0
    start_time = datetime.now()
    while True:
        try:
            step += 1
            model_cost, _, summary = train_sess.run((cost, train_op, merged))
            writer.add_summary(summary, step)

            if step % display_step == 0:
                save_path = train_saver.save(train_sess, "./checkpoints/NMT")
                print("Model saved in path: %s" % save_path)

                test_sess = tf.Session(graph=test_graph)  # Note: a fresh test session is opened on every display step
                test_saver.restore(test_sess, save_path)

                # print(test_sess.run(foo))
                scores = []
                while True:
                    try:
                        predictions, reference = test_sess.run([dec_predictions, test_target_batch])
                        for i in range(len(reference)):
                            BLEU_score = nltk.translate.bleu_score.sentence_bleu([np.trim_zeros(reference[i])], np.trim_zeros(predictions[i]), weights=(1, 0))
                            scores.append(BLEU_score)
                            print("########################################")
                            print("ref:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(reference[i]))))
                            print("")
                            print("pred:", list(map(lambda x: target_ID2tok[x], np.trim_zeros(predictions[i]))))
                    except tf.errors.OutOfRangeError:
                        print("Exhausted test data")
                        break

                delta_time = datetime.now() - start_time
                total_exp_time = delta_time * (total_steps / step)
                remaining_time = total_exp_time - delta_time

                print("")
                print("Test set BLEU Score:", np.mean(scores))
                print("Model cost:", model_cost)
                print("Step {} of {}".format(step, total_steps))
                print("Current time:", datetime.now())
                print("Total experiment time (Hours:Minutes:Seconds):", str(total_exp_time))
                print("Time elapsed (Hours:Minutes:Seconds):", str(delta_time))
                print("Time remaining (Hours:Minutes:Seconds):", str(remaining_time))
                print("")

        except tf.errors.OutOfRangeError:
            print("Model finished training")
            break

    return save_path
Here is the output from a training run: command line output
Is there a problem with the way I'm executing the graph, or am I repeating some step that causes memory to fill up?
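In case it helps narrow this down, I believe TF 1.x can also report which tensors hold memory at the moment of the OOM via tf.RunOptions; a sketch of how I would wire that into my training step:

run_opts = tf.RunOptions(report_tensor_allocations_upon_oom=True)
model_cost, _, summary = train_sess.run((cost, train_op, merged), options=run_opts)  # on OOM, the error lists the current tensor allocations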
Thanks for all your help!