I'm running Python 2 on a GPU-enabled instance. I'm training an LSTM and saving a checkpoint every 10 epochs. Without fail, the VM restarts every 45 minutes (right before it completes 50 epochs). This has been going on for several days, on both my home wifi (Comcast) and my work wireless network. I suspect the problem is native to Google's setup or the notebook settings, but I can't find anything to tweak.

My question is: has anyone run into this? How did you fix it?

I'm including my code here, but I don't think the code is the problem. It fails in the last if epoch % epoch_saving_period ... block.
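# Note: get_batches, corpus_int, batch_size, seq_length, save_dir, last_epoch,
# num_epochs, learning_rate, cost, train_op, train_graph, config and bucket_name
# are all defined in earlier notebook cells (not shown here).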
import math
import pickle
import time
import matplotlib.pyplot as plt
import tensorflow as tf

pickle.dump((seq_length, save_dir), open('params.p', 'wb'))
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()
print "Process started"
last_checkpoint_prefix = '/tmp/pretrained.ckpt-' + str(last_epoch)
tf.reset_default_graph()
with tf.Session(graph=train_graph, config=config) as sess:
    saver = tf.train.Saver(tf.global_variables())
    #tf.add_to_collection('train_op', train_op)

    # If you're loading in a saved model, use the following
    if last_epoch > 0:
        #saver = tf.train.import_meta_graph(last_checkpoint_prefix + '.meta')
        saver.restore(sess, tf.train.latest_checkpoint('/tmp/'))
        sess.run(tf.local_variables_initializer())
    else:
        # If you're running a fresh session, use the following
        sess.run(tf.global_variables_initializer())
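
    # Pull the input/output tensors back out of the graph by name.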
    input_text = train_graph.get_tensor_by_name('input:0')
    initial_state = train_graph.get_tensor_by_name('initial_state:0')
    final_state = train_graph.get_tensor_by_name('final_state:0')
    probs = train_graph.get_tensor_by_name('probs:0')
    targets = train_graph.get_tensor_by_name('targets:0')
    lr = train_graph.get_tensor_by_name('learning_rate:0')
    #init_from_checkpoint('/tmp/pretrained.ckpt', {'input': 'input',
    #                     'final_state': 'initial_state',
    #                     'targets': 'targets',
    #                     'learning_rate': 'learning_rate'})

    epochList = []
    lossList = []
    epoch_saving_period = 10
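
    # Resume epoch numbering from last_epoch so checkpoint filenames
    # (global_step=epoch) stay consistent across restarts.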
    for epoch in range(last_epoch, last_epoch + num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                # Feed the exponentially decayed learning rate to the lr
                # placeholder (1000.0 keeps the division floating-point in Python 2).
                lr: learning_rate * math.exp(-epoch / 1000.0)
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            time_elapsed = time.time() - start_time
            print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_elapsed = {:.3f}'.format(
                epoch + 1,
                batch_index + 1,
                len(batches),
                train_loss,
                time_elapsed))

        epochList.append(epoch)
        lossList.append(train_loss)

        # save model every 10 epochs
        if epoch % epoch_saving_period == 0:
            last_epoch = epoch - epoch_saving_period
            #saver = tf.train.Saver()
            #saver.save(sess, save_dir)
            savePath = saver.save(sess, "/tmp/pretrained.ckpt", global_step=epoch, write_meta_graph=True)
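            # With global_step=epoch, saver.save writes pretrained.ckpt-{epoch}.index,
            # .meta, and .data-00000-of-00001, and updates /tmp/checkpoint (the file
            # tf.train.latest_checkpoint reads), hence the four copies below.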
            # Copy the checkpoint files to our bucket.
            # Full reference: https://cloud.google.com/storage/docs/gsutil/commands/cp
            !gsutil cp /tmp/checkpoint gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.index gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.meta gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.data-00000-of-00001 gs://{bucket_name}/
            # Delete the previous checkpoint from the bucket.
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.index
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.meta
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.data-00000-of-00001

            print('Model Trained and Saved')

        if epoch % 5 == 0:
            plt.plot(epochList, lossList)
            plt.title('Train Loss')
            plt.show()
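
In case it's relevant, here is the same upload step sketched with subprocess instead of the ! shell magics, so a failed gsutil copy raises a visible CalledProcessError instead of passing silently. upload_checkpoint is just a hypothetical helper name; it assumes the same /tmp paths and bucket_name as above.

import subprocess

def upload_checkpoint(epoch, last_epoch, bucket_name):
    # Copy the current checkpoint files to the bucket; check_call raises
    # CalledProcessError if any gsutil command exits non-zero.
    dest = 'gs://{}/'.format(bucket_name)
    exts = ('index', 'meta', 'data-00000-of-00001')
    files = ['/tmp/checkpoint'] + ['/tmp/pretrained.ckpt-{}.{}'.format(epoch, e) for e in exts]
    for f in files:
        subprocess.check_call(['gsutil', 'cp', f, dest])
    # Prune the previous checkpoint; plain call() so a missing object is not fatal.
    for e in exts:
        subprocess.call(['gsutil', 'rm', 'gs://{}/pretrained.ckpt-{}.{}'.format(bucket_name, last_epoch, e)])

It would be called in place of the ! lines inside the if epoch % epoch_saving_period block.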