我正在使用的是cpu机器和gpu机器。代码在cpu机器上。 tf.train.Saver在非分布式tensorflow的cpu机器上运行良好。但是,当我使用gpu机器在分布式tensorflow下运行代码时,它无法保存。
保存时，报错找不到 tempstatexxxxx 文件。给 Saver 传入 sharded=True 后，它只创建了一个 checkpoint 文件和一个 .meta 文件，而 checkpoint 文件里列出的分片数据文件并不存在，因此恢复（restore）无法正常工作。
我该怎么办?
import tensorflow as tf
from common import tfec2
save_dir = "/tmp"
W = tf.Variable(tf.zeros([784, 10]), name="weights")
b = tf.Variable(tf.zeros([10]), name="bias")
step = tf.Variable(0)
step_length = tf.Variable(1)
saver = tf.train.Saver(tf.all_variables(), sharded=True)
ckpt = tf.train.get_checkpoint_state(save_dir)
init = tf.initialize_all_variables()
with tfec2.TFEc2() as sess: # distributed
# with tf.Session() as sess:
print ckpt
if ckpt:
print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
res_obj = saver.restore(sess, ckpt.model_checkpoint_path)
print res_obj
else:
print("Created model with fresh parameters.")
sess.run(init)
print step.eval(session=sess)
# print sess.run(step)
op_a = tf.add(step, step_length)
op = tf.assign(step, op_a)
sess.run(op)
print step.eval(session=sess)
print("Start Save")
saver.save(sess, save_dir+"/example.ckpt", step)
print("End Save")
一些错误记录
Traceback (most recent call last):
File "example.py", line 78, in <module>
saver.save(sess, save_dir+"/example.ckpt", step)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 1037, in save
{self.saver_def.filename_tensor_name: checkpoint_file})
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 340, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 564, in _run
feed_dict_string, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 637, in _do_run
target_list, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 659, in _do_call
e.code)
tensorflow.python.framework.errors.NotFoundError: /tmp/model/example.ckpt-1-00000-of-00001.tempstate3052208956008812953
[[Node: save/save = SaveSlices[T=[DT_INT32, DT_INT32, DT_FLOAT, DT_FLOAT], _device="/job:worker/replica:0/task:0/cpu:0"](save/ShardedFilename, save/save/tensor_names, save/save/shapes_and_slices, Variable, Variable_1, bias_G209, weights_G211)]]
Caused by op u'save/save', defined at:
File "example.py", line 34, in <module>
saver = tf.train.Saver(tf.all_variables(), sharded=True)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 832, in __init__
restore_sequentially=restore_sequentially)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 496, in build
save_tensor = self._AddShardedSaveOps(filename_tensor, per_device)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 218, in _AddShardedSaveOps
sharded_saves.append(self._AddSaveOps(sharded_filename, vars_to_save))
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 197, in _AddSaveOps
save = self.save_op(filename_tensor, vars_to_save)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 149, in save_op
tensor_slices=[vs.slice_spec for vs in vars_to_save])
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/io_ops.py", line 172, in _save
tensors, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_io_ops.py", line 341, in _save_slices
name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 661, in apply_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
self._traceback = _extract_stack()
I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:206] Initialize HostPortsGrpcChannelCache for job saver -> {localhost:2223}
I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:206] Initialize HostPortsGrpcChannelCache for job worker -> {172.31.26.237:2222}
I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:202] Started server with target: grpc://localhost:2223
Session Start!!!!!
model_checkpoint_path: "/tmp/model.ckpt-?????-of-00001"
all_model_checkpoint_paths: "/tmp/model.ckpt-?????-of-00001"
Reading model parameters from /tmp/model.ckpt-?????-of-00001
Traceback (most recent call last):
File "example.py", line 65, in <module>
res_obj = saver.restore(sess, ckpt.model_checkpoint_path)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 1088, in restore
raise ValueError("Restore called with invalid save path %s" % save_path)
ValueError: Restore called with invalid save path /tmp/model.ckpt-?????-of-00001
（worker 是 gpu 机器，saver 任务是 cpu 机器）
Saver有效!
# Variant that WORKS: both jobs host at least one variable, and the Saver is
# constructed on /job:saver. Commented-out lines show what was moved between
# the two device scopes in each experiment.
with tf.device("/job:worker"):
    # W = tf.Variable(tf.zeros([784, 10]), name="weights")
    b = tf.Variable(tf.zeros([10]), name="bias")
    step = tf.Variable(0)
    step_length = tf.Variable(1)
    op_a = tf.add(step, step_length)
    op = tf.assign(step, op_a)
    # saver = tf.train.Saver(tf.all_variables(), sharded=True)

with tf.device("/job:saver"):
    W = tf.Variable(tf.zeros([784, 10]), name="weights")
    # b = tf.Variable(tf.zeros([10]), name="bias")
    # step = tf.Variable(0)
    # step_length = tf.Variable(1)
    # op_a = tf.add(step, step_length)
    # op = tf.assign(step, op_a)
    saver = tf.train.Saver(tf.all_variables(), sharded=True)
但是如果 cpu 机器（/job:saver）上没有任何变量，或者 Saver 对象不是建在 cpu 机器上，就不行了：
# Variant that FAILS: all variables live on /job:worker; /job:saver hosts
# nothing except the Saver, so the sharded checkpoint data is written only
# on the worker machine.
with tf.device("/job:worker"):
    W = tf.Variable(tf.zeros([784, 10]), name="weights")
    b = tf.Variable(tf.zeros([10]), name="bias")
    step = tf.Variable(0)
    step_length = tf.Variable(1)
    op_a = tf.add(step, step_length)
    op = tf.assign(step, op_a)
    # saver = tf.train.Saver(tf.all_variables(), sharded=True)

with tf.device("/job:saver"):
    # W = tf.Variable(tf.zeros([784, 10]), name="weights")
    # b = tf.Variable(tf.zeros([10]), name="bias")
    # step = tf.Variable(0)
    # step_length = tf.Variable(1)
    # op_a = tf.add(step, step_length)
    # op = tf.assign(step, op_a)
    saver = tf.train.Saver(tf.all_variables(), sharded=True)
或
# Variant that also FAILS: the Saver is constructed inside the /job:worker
# device scope even though /job:saver hosts a variable, so the save ops run
# on the worker and the shard files land on the worker's filesystem.
with tf.device("/job:worker"):
    # W = tf.Variable(tf.zeros([784, 10]), name="weights")
    b = tf.Variable(tf.zeros([10]), name="bias")
    step = tf.Variable(0)
    step_length = tf.Variable(1)
    op_a = tf.add(step, step_length)
    op = tf.assign(step, op_a)
    saver = tf.train.Saver(tf.all_variables(), sharded=True)

with tf.device("/job:saver"):
    W = tf.Variable(tf.zeros([784, 10]), name="weights")
    # b = tf.Variable(tf.zeros([10]), name="bias")
    # step = tf.Variable(0)
    # step_length = tf.Variable(1)
    # op_a = tf.add(step, step_length)
    # op = tf.assign(step, op_a)
    # saver = tf.train.Saver(tf.all_variables(), sharded=True)
错误是
Session Start!!!!!
model_checkpoint_path: "/tmp/model/example.ckpt-1-?????-of-00001"
all_model_checkpoint_paths: "/tmp/model/example.ckpt-1-?????-of-00001"
Reading model parameters from /tmp/model/example.ckpt-1-?????-of-00001
Traceback (most recent call last):
File "example.py", line 64, in <module>
res_obj = saver.restore(sess, ckpt.model_checkpoint_path)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/saver.py", line 1088, in restore
raise ValueError("Restore called with invalid save path %s" % save_path)
ValueError: Restore called with invalid save path /tmp/model/example.ckpt-1-?????-of-00001
这个 example.ckpt-1-?????-of-00001 分片文件在 gpu 机器上；cpu 机器上只有 checkpoint 和 .meta 文件。