I'm running Python 2 on a GPU-enabled instance. I'm training an LSTM and saving a checkpoint every 10 epochs. Without fail, the VM restarts every 45 minutes (right before it completes 50 epochs). This has been going on for several days, on both my home wifi (Comcast) and my work wireless network. I suspect the problem is native to Google's setup or the notebook settings, but I can't find anything to tweak.

My question is: has anyone run into this? How did you fix it?

I'm including my code here, but I don't think the code is the problem. It fails in the last if epoch % epoch_saving_period ... block.
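# Note: get_batches, corpus_int, batch_size, seq_length, save_dir, last_epoch,
# num_epochs, learning_rate, cost, train_op, train_graph, config and bucket_name
# are all defined in earlier notebook cells (not shown here).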
import math
import pickle
import time
import matplotlib.pyplot as plt
import tensorflow as tf

pickle.dump((seq_length, save_dir), open('params.p', 'wb'))
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()
print "Process started"
last_checkpoint_prefix = '/tmp/pretrained.ckpt-' + str(last_epoch)
tf.reset_default_graph()
with tf.Session(graph=train_graph, config=config) as sess:
    saver = tf.train.Saver(tf.global_variables())
    #tf.add_to_collection('train_op', train_op)

    # If you're loading in a saved model, use the following
    if last_epoch > 0:
        #saver = tf.train.import_meta_graph(last_checkpoint_prefix + '.meta')
        saver.restore(sess, tf.train.latest_checkpoint('/tmp/'))
        sess.run(tf.local_variables_initializer())
    else:
        # If you're running a fresh session, use the following
        sess.run(tf.global_variables_initializer())
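
    # Pull the input/output tensors back out of the graph by name.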
    input_text = train_graph.get_tensor_by_name('input:0')
    initial_state = train_graph.get_tensor_by_name('initial_state:0')
    final_state = train_graph.get_tensor_by_name('final_state:0')
    probs = train_graph.get_tensor_by_name('probs:0')
    targets = train_graph.get_tensor_by_name('targets:0')
    lr = train_graph.get_tensor_by_name('learning_rate:0')
    #init_from_checkpoint('/tmp/pretrained.ckpt', {'input': 'input',
    #                     'final_state': 'initial_state',
    #                     'targets': 'targets',
    #                     'learning_rate': 'learning_rate'})

    epochList = []
    lossList = []
    epoch_saving_period = 10
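
    # Resume epoch numbering from last_epoch so checkpoint filenames
    # (global_step=epoch) stay consistent across restarts.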
    for epoch in range(last_epoch, last_epoch + num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                # Feed the exponentially decayed learning rate to the lr
                # placeholder (1000.0 keeps the division floating-point in Python 2).
                lr: learning_rate * math.exp(-epoch / 1000.0)
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            time_elapsed = time.time() - start_time
            print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f} time_elapsed = {:.3f}'.format(
                epoch + 1,
                batch_index + 1,
                len(batches),
                train_loss,
                time_elapsed))

        epochList.append(epoch)
        lossList.append(train_loss)

        # save model every 10 epochs
        if epoch % epoch_saving_period == 0:
            last_epoch = epoch - epoch_saving_period
            #saver = tf.train.Saver()
            #saver.save(sess, save_dir)
            savePath = saver.save(sess, "/tmp/pretrained.ckpt", global_step=epoch, write_meta_graph=True)
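            # With global_step=epoch, saver.save writes pretrained.ckpt-{epoch}.index,
            # .meta, and .data-00000-of-00001, and updates /tmp/checkpoint (the file
            # tf.train.latest_checkpoint reads), hence the four copies below.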
            # Copy the checkpoint files to our bucket.
            # Full reference: https://cloud.google.com/storage/docs/gsutil/commands/cp
            !gsutil cp /tmp/checkpoint gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.index gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.meta gs://{bucket_name}/
            !gsutil cp /tmp/pretrained.ckpt-{epoch}.data-00000-of-00001 gs://{bucket_name}/
            # Delete the previous checkpoint from the bucket.
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.index
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.meta
            !gsutil rm gs://{bucket_name}/pretrained.ckpt-{last_epoch}.data-00000-of-00001

            print('Model Trained and Saved')

        if epoch % 5 == 0:
            plt.plot(epochList, lossList)
            plt.title('Train Loss')
            plt.show()
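
In case it's relevant, here is the same upload step sketched with subprocess instead of the ! shell magics, so a failed gsutil copy raises a visible CalledProcessError instead of passing silently. upload_checkpoint is just a hypothetical helper name; it assumes the same /tmp paths and bucket_name as above.

import subprocess

def upload_checkpoint(epoch, last_epoch, bucket_name):
    # Copy the current checkpoint files to the bucket; check_call raises
    # CalledProcessError if any gsutil command exits non-zero.
    dest = 'gs://{}/'.format(bucket_name)
    exts = ('index', 'meta', 'data-00000-of-00001')
    files = ['/tmp/checkpoint'] + ['/tmp/pretrained.ckpt-{}.{}'.format(epoch, e) for e in exts]
    for f in files:
        subprocess.check_call(['gsutil', 'cp', f, dest])
    # Prune the previous checkpoint; plain call() so a missing object is not fatal.
    for e in exts:
        subprocess.call(['gsutil', 'rm', 'gs://{}/pretrained.ckpt-{}.{}'.format(bucket_name, last_epoch, e)])

It would be called in place of the ! lines inside the if epoch % epoch_saving_period block.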