我正在使用的是cpu机器和gpu机器。代码在cpu机器上。 tf.train.Saver在非分布式tensorflow的cpu机器上运行良好。但是,当我使用gpu机器在分布式tensorflow下运行代码时,它无法保存。
保存时，报错找不到 tempstatexxxxx 文件。给 Saver 传入 sharded=True 后，它只创建了一个 checkpoint 文件和一个 .meta 文件，而 checkpoint 文件里列出的分片数据文件并不存在，因此恢复（restore）无法正常工作。
我该怎么办?
import tensorflow as tf
from common import tfec2
save_dir = "/tmp"
W = tf.Variable(tf.zeros([784, 10]), name="weights")
b = tf.Variable(tf.zeros([10]), name="bias")
step = tf.Variable(0)
step_length = tf.Variable(1)
saver = tf.train.Saver(tf.all_variables(), sharded=True)
ckpt = tf.train.get_checkpoint_state(save_dir)
init = tf.initialize_all_variables()
with tfec2.TFEc2() as sess: # distributed
# with tf.Session() as sess:
print ckpt
if ckpt:
print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
res_obj = saver.restore(sess, ckpt.model_checkpoint_path)
print res_obj
else:
print("Created model with fresh parameters.")
sess.run(init)
print step.eval(session=sess)
# print sess.run(step)
op_a = tf.add(step, step_length)
op = tf.assign(step, op_a)
sess.run(op)
print step.eval(session=sess)
print("Start Save")
saver.save(sess, save_dir+"/example.ckpt", step)
print("End Save")
一些错误记录
Traceback (most recent call last):
File "example.py", line 78, in <module>
saver.save(sess, save_dir+"/example.ckpt", step)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 1037, in save
{self.saver_def.filename_tensor_name: checkpoint_file})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 564, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 637, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 659, in _do_call
e.code)
tensorflow.python.framework.errors.NotFoundError: /tmp/model/example.ckpt-1-00000-of-00001.tempstate3052208956008812953
[[Node: save/save = SaveSlices[T=[DT_INT32, DT_INT32, DT_FLOAT, DT_FLOAT], _device="/job:worker/replica:0/task:0/cpu:0"](save/ShardedFilename, save/save/tensor_names, save/save/shapes_and_slices, Variable, Variable_1, bias_G209, weights_G211)]]
Caused by op u'save/save', defined at:
File "example.py", line 34, in <module>
saver = tf.train.Saver(tf.all_variables(), sharded=True)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 832, in __init__
restore_sequentially=restore_sequentially)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 496, in build
save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 218, in _AddShardedSaveOps
sharded_saves.append(self._AddSaveOps(sharded_filename, vars_to_save))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 197, in _AddSaveOps
save = self.save_op(filename_tensor, vars_to_save)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 149, in save_op
tensor_slices=[vs.slice_spec for vs in vars_to_save])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/io_ops.py", line 172, in _save
tensors, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 341, in _save_slices
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 661, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()
I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:206] Initialize HostPortsGrpcChannelCache for job saver -> {localhost:2223}
I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:206] Initialize HostPortsGrpcChannelCache for job worker -> {172.31.26.237:2222}
I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:202] Started server with target: grpc://localhost:2223
Session Start!!!!!
model_checkpoint_path: "/tmp/model.ckpt-?????-of-00001"
all_model_checkpoint_paths: "/tmp/model.ckpt-?????-of-00001"
Reading model parameters from /tmp/model.ckpt-?????-of-00001
Traceback (most recent call last):
File "example.py", line 65, in <module>
res_obj = saver.restore(sess, ckpt.model_checkpoint_path)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 1088, in restore
raise ValueError("Restore called with invalid save path %s" % save_path)
ValueError: Restore called with invalid save path /tmp/model.ckpt-?????-of-00001
（worker 是 gpu 机器，saver 任务是 cpu 机器）
Saver有效!
# Variant that WORKS: both jobs host at least one variable, and the Saver is
# constructed on /job:saver. Commented-out lines show what was moved between
# the two device scopes in each experiment.
with tf.device("/job:worker"):
    # W = tf.Variable(tf.zeros([784, 10]), name="weights")
    b = tf.Variable(tf.zeros([10]), name="bias")
    step = tf.Variable(0)
    step_length = tf.Variable(1)
    op_a = tf.add(step, step_length)
    op = tf.assign(step, op_a)
    # saver = tf.train.Saver(tf.all_variables(), sharded=True)

with tf.device("/job:saver"):
    W = tf.Variable(tf.zeros([784, 10]), name="weights")
    # b = tf.Variable(tf.zeros([10]), name="bias")
    # step = tf.Variable(0)
    # step_length = tf.Variable(1)
    # op_a = tf.add(step, step_length)
    # op = tf.assign(step, op_a)
    saver = tf.train.Saver(tf.all_variables(), sharded=True)
但是如果 cpu 机器（/job:saver）上没有任何变量，或者 Saver 对象不是建在 cpu 机器上，就不行了：
# Variant that FAILS: all variables live on /job:worker; /job:saver hosts
# nothing except the Saver, so the sharded checkpoint data is written only
# on the worker machine.
with tf.device("/job:worker"):
    W = tf.Variable(tf.zeros([784, 10]), name="weights")
    b = tf.Variable(tf.zeros([10]), name="bias")
    step = tf.Variable(0)
    step_length = tf.Variable(1)
    op_a = tf.add(step, step_length)
    op = tf.assign(step, op_a)
    # saver = tf.train.Saver(tf.all_variables(), sharded=True)

with tf.device("/job:saver"):
    # W = tf.Variable(tf.zeros([784, 10]), name="weights")
    # b = tf.Variable(tf.zeros([10]), name="bias")
    # step = tf.Variable(0)
    # step_length = tf.Variable(1)
    # op_a = tf.add(step, step_length)
    # op = tf.assign(step, op_a)
    saver = tf.train.Saver(tf.all_variables(), sharded=True)
或
# Variant that also FAILS: the Saver is constructed inside the /job:worker
# device scope even though /job:saver hosts a variable, so the save ops run
# on the worker and the shard files land on the worker's filesystem.
with tf.device("/job:worker"):
    # W = tf.Variable(tf.zeros([784, 10]), name="weights")
    b = tf.Variable(tf.zeros([10]), name="bias")
    step = tf.Variable(0)
    step_length = tf.Variable(1)
    op_a = tf.add(step, step_length)
    op = tf.assign(step, op_a)
    saver = tf.train.Saver(tf.all_variables(), sharded=True)

with tf.device("/job:saver"):
    W = tf.Variable(tf.zeros([784, 10]), name="weights")
    # b = tf.Variable(tf.zeros([10]), name="bias")
    # step = tf.Variable(0)
    # step_length = tf.Variable(1)
    # op_a = tf.add(step, step_length)
    # op = tf.assign(step, op_a)
    # saver = tf.train.Saver(tf.all_variables(), sharded=True)
错误是
Session Start!!!!!
model_checkpoint_path: "/tmp/model/example.ckpt-1-?????-of-00001"
all_model_checkpoint_paths: "/tmp/model/example.ckpt-1-?????-of-00001"
Reading model parameters from /tmp/model/example.ckpt-1-?????-of-00001
Traceback (most recent call last):
File "example.py", line 64, in <module>
res_obj = saver.restore(sess, ckpt.model_checkpoint_path)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 1088, in restore
raise ValueError("Restore called with invalid save path %s" % save_path)
ValueError: Restore called with invalid save path /tmp/model/example.ckpt-1-?????-of-00001
这个 example.ckpt-1-?????-of-00001 分片文件在 gpu 机器上；cpu 机器上只有 checkpoint 和 .meta 文件。