我在使用 tf.contrib.opt 中的 ModelAverageOptimizer 时遇到了问题。
我的代码主要参考 model_average_optimizer_test.py 编写。
我发现该错误发生在 ModelAverageOptimizer 的 __init__ 中创建 local_step 变量时,
问题更可能出在 device_setter 与 custom getter 的交互上,
但我不清楚具体原因。有人可以帮助我吗?
我是否写过自定义代码:是
操作系统平台和分发:Linux n10-044-067 4.4.0-33.bm.1-amd64#1 SMP星期四,2017年6月22日11:19:55 +0800 x86_64 GNU / Linux
TensorFlow安装自:Anaconda2.5,pip install tensorflow_gpu
TensorFlow版本:1.8.0
Bazel版本:N / A
CUDA / cuDNN版本:CUDA-9.0,cuDNN-7.1.2(也安装在anaconda2.5中)
GPU型号和内存:GeForce GTX 1080
准确的命令重现:
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=worker --task_id=0
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=worker --task_id=1
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=server --task_id=0
您可能会注意到,当某个 worker 进程初始化 ModelAverageOptimizer 类时,会抛出以下异常:
Traceback (most recent call last):
File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 112, in tf.app.run()
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 109, in main
test()
File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 84, in test
interval_steps=3)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/contrib/opt/python/training/model_average_optimizer.py", line 139, in init name="local_step")
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1317, in get_variable
constraint=constraint)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1079, in get_variable
constraint=constraint)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 417, in get_variable
return custom_getter(**custom_getter_kwargs)
File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/contrib/opt/python/training/model_average_optimizer.py", line 92, in call
return getter(name, trainable, collections, *args, **kwargs)
TypeError: _true_getter() got multiple values for keyword argument 'dtype'
代码如下:
import os
import time
import json
import copy
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.contrib.opt.python.training import model_average_optimizer
# Command-line flags describing the cluster layout and this process's role.
flags = tf.flags
flags.DEFINE_string("server_hosts", "",
                    "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", "",
                    "Comma-separated list of hostname:port pairs")
# BUG FIX: help text read "Either 'server' of 'worker'" -- typo 'of' -> 'or'.
flags.DEFINE_string("job_name", "", "Either 'server' or 'worker'")
flags.DEFINE_integer("task_id", 0, "Task id of this process within its job")
FLAGS = flags.FLAGS
tf.logging.set_verbosity(tf.logging.INFO)
def workers_ps_creator(args):
    """Build the tf.train.Server for this process from parsed flags.

    Args:
        args: parsed FLAGS carrying server_hosts, worker_hosts, job_name
            and task_id.

    Returns:
        Tuple (server, cluster, num_workers, gpu_options).

    Raises:
        ValueError: if args.job_name is neither 'server' nor 'worker'.
    """
    ps_hosts = args.server_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    num_workers = len(worker_hosts)
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    gpu_options = tf.GPUOptions(allocator_type='BFC', allow_growth=True)
    if args.job_name == "server":
        # The parameter server runs CPU-only (device_count={"GPU": 0}).
        server_def = tf.train.ServerDef(
            cluster=cluster.as_cluster_def(),
            job_name='ps',
            task_index=args.task_id,
            default_session_config=tf.ConfigProto(
                gpu_options=gpu_options, device_count={"GPU": 0}),
            protocol="grpc")
    elif args.job_name == "worker":
        server_def = tf.train.ServerDef(
            cluster=cluster.as_cluster_def(),
            job_name="worker",
            task_index=args.task_id,
            default_session_config=tf.ConfigProto(gpu_options=gpu_options),
            protocol="grpc")
    else:
        # BUG FIX: server_def was previously left unbound for an unknown
        # job_name, causing a confusing NameError below; fail fast instead.
        raise ValueError(
            "job_name must be 'server' or 'worker', got %r" % args.job_name)
    server = tf.train.Server(server_def)
    return server, cluster, num_workers, gpu_options
def Model(opt):
    """Build a tiny two-variable model and return its training op.

    The chief (task 0) and the other workers start from different initial
    values / gradients so model averaging has something visible to do.

    Args:
        opt: an optimizer exposing apply_gradients (here the
            ModelAverageOptimizer wrapper).

    Returns:
        The train_op produced by opt.apply_gradients, which also increments
        the global step.
    """
    if FLAGS.task_id == 0:
        var_0 = tf.get_variable(initializer=0.0, name='v0')
        var_1 = tf.get_variable(initializer=1.0, name='v1')
        # BUG FIX: the original called constant_op.constant, but constant_op
        # is never imported in this file (NameError); tf.constant is the
        # public equivalent.
        grads_0 = tf.constant(-1.0)
        grads_1 = tf.constant(-1.0)
    else:
        var_0 = tf.get_variable(initializer=7.0, name='v0')
        var_1 = tf.get_variable(initializer=8.0, name='v1')
        grads_0 = tf.constant(-2.0)
        grads_1 = tf.constant(-2.0)
    train_op = opt.apply_gradients(
        [[grads_0, var_0], [grads_1, var_1]],
        global_step=tf.train.get_or_create_global_step())
    return train_op
def test():
    """Run one process of the distributed model-averaging demo.

    A 'server' process joins the cluster and blocks forever; a 'worker'
    process builds the model under the ModelAverageCustomGetter, trains for
    a few steps inside a MonitoredTrainingSession, then logs the variable
    values.
    """
    server, cluster, num_workers, gpu_options = workers_ps_creator(FLAGS)
    if FLAGS.job_name == "server":
        server.join()
    elif FLAGS.job_name == "worker":
        is_chief = (FLAGS.task_id == 0)
        # Between-graph replication: each worker builds its own graph copy.
        worker_device = "/job:worker/task:%d" % (FLAGS.task_id)
        ma_custom = model_average_optimizer.ModelAverageCustomGetter(
            worker_device=worker_device)
        from tensorflow.python.training import device_setter
        with tf.device(
            device_setter.replica_device_setter(
                cluster=cluster,
                worker_device=worker_device,
                ps_device="/job:ps")), \
            tf.variable_scope("", custom_getter=ma_custom):
            # Create the model under both the device setter and the
            # model-average custom getter so variables land on the right
            # devices.
            lr = tf.Variable(1, trainable=False)
            opt = tf.train.GradientDescentOptimizer(lr)
            sync_opt = model_average_optimizer.ModelAverageOptimizer(
                opt=opt,
                num_worker=num_workers,
                ma_custom_getter=ma_custom,
                is_chief=is_chief,
                interval_steps=3)
            tf.logging.info('model start')
            train_model = Model(sync_opt)
            tf.logging.info('model end')
        ma_hook = sync_opt.make_session_run_hook()
        sess_config = tf.ConfigProto(gpu_options=gpu_options)
        sess_config.log_device_placement = False
        sess_config.allow_soft_placement = True
        all_hooks = [ma_hook]
        tf.logging.info('Start Sess')
        with tf.train.MonitoredTrainingSession(master=server.target,
                                               is_chief=is_chief,
                                               hooks=all_hooks) as sess:
            tf.logging.info("is chief: %s, len: %s", is_chief, num_workers)
            for i in range(4):
                # BUG FIX: the original ran sess.run(train_op), but the op
                # is bound to the name train_model above (NameError).
                sess.run(train_model)
            pp1 = sess.run(tf.get_default_graph().get_tensor_by_name('v0:0'))
            pp2 = sess.run(tf.get_default_graph().get_tensor_by_name('v1:0'))
            tf.logging.info("%d %.2f %.2f" % (FLAGS.task_id, pp1, pp2))
        # BUG FIX: removed sv.stop() -- no Supervisor named sv exists; the
        # MonitoredTrainingSession context manager already handles shutdown.
        tf.logging.info("done")