TensorFlow: ModelAverageOptimizer got multiple values for keyword argument 'dtype'

Date: 2018-05-29 12:04:02

Tags: tensorflow

I am using ModelAverageOptimizer and ran into a problem:
My code closely follows model_average_optimizer_test.py. I found that the error occurs in ModelAverageOptimizer's __init__ when it creates local_step, and it looks like an interaction with the device_setter, but I don't understand why. Can someone help me?
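For context, any get_variable call made inside a scope that carries a custom_getter is routed through that getter, which is why the optimizer's internal local_step variable ends up going through ModelAverageCustomGetter. A minimal sketch of that routing (the logging_getter below is hypothetical, plain TF 1.x):

import tensorflow as tf

def logging_getter(getter, name, *args, **kwargs):
    # A pass-through custom getter; every get_variable under the scope
    # below is funneled through here, just like ModelAverageCustomGetter.
    print("routed through custom getter:", name)
    return getter(name, *args, **kwargs)

with tf.variable_scope("", custom_getter=logging_getter):
    step = tf.get_variable("local_step", initializer=0, trainable=False)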

Have I written custom code: Yes
OS Platform and Distribution: Linux n10-044-067 4.4.0-33.bm.1-amd64 #1 SMP Thu Jun 22 11:19:55 +0800 2017 x86_64 GNU/Linux
TensorFlow installed from: Anaconda 2.5, pip install tensorflow_gpu
TensorFlow version: 1.8.0
Bazel version: N/A
CUDA/cuDNN version: CUDA 9.0, cuDNN 7.1.2 (also installed in Anaconda 2.5)
GPU model and memory: GeForce GTX 1080
Exact commands to reproduce:

~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=worker --task_id=0
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=worker --task_id=1
~/anaconda2/bin/python test.py --server_hosts=localhost:12222 --worker_hosts=localhost:12223,localhost:12224 --job_name=server --task_id=0

You may notice that the following error occurs when one of the workers initializes the ModelAverageOptimizer class:

Traceback (most recent call last):
  File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 112, in <module>
    tf.app.run()
  File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 126, in run
    _sys.exit(main(argv))
  File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 109, in main
    test()
  File "/data00/home/wupeihao/ma_test/src/rnnlm.py", line 84, in test
    interval_steps=3)
  File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/contrib/opt/python/training/model_average_optimizer.py", line 139, in __init__
    name="local_step")
  File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1317, in get_variable
    constraint=constraint)
  File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1079, in get_variable
    constraint=constraint)
  File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 417, in get_variable
    return custom_getter(**custom_getter_kwargs)
  File "/data00/home/wupeihao/anaconda2/lib/python2.7/site-packages/tensorflow/contrib/opt/python/training/model_average_optimizer.py", line 92, in __call__
    return getter(name, trainable, collections, *args, **kwargs)
TypeError: _true_getter() got multiple values for keyword argument 'dtype'
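The last two frames hint at the mechanism: get_variable invokes the custom getter with everything passed as keyword arguments (custom_getter(**custom_getter_kwargs)), while ModelAverageCustomGetter.__call__ forwards trainable and collections on positionally. A minimal, TensorFlow-free sketch of that kind of collision (all names below are hypothetical):

# Hypothetical stand-ins for _true_getter and the custom getter, just to
# demonstrate the positional/keyword collision.
def true_getter(name, shape=None, dtype=None, **kwargs):
    return name, shape, dtype

def custom_getter(getter, name, trainable=True, collections=None, *args, **kwargs):
    # trainable and collections are forwarded positionally, so they occupy
    # the shape and dtype slots of true_getter; the dtype still present in
    # kwargs then collides with the positional value.
    return getter(name, trainable, collections, *args, **kwargs)

custom_getter(true_getter, "local_step", trainable=False,
              collections=["local_variables"], dtype="int32")
# TypeError: true_getter() got multiple values for keyword argument 'dtype'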

The code is as follows:

import os
import time
import json
import copy
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.contrib.opt.python.training import model_average_optimizer

flags = tf.flags
flags.DEFINE_string("server_hosts", "", "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("worker_hosts", "", "Comma-separated list of hostname:port pairs")
flags.DEFINE_string("job_name", "", "Either 'server' of 'worker'")
flags.DEFINE_integer("task_id", 0, "Task Id for Each workers")

FLAGS = flags.FLAGS
tf.logging.set_verbosity(tf.logging.INFO)

def workers_ps_creator(args):
    ps_hosts = args.server_hosts.split(",")
    worker_hosts = args.worker_hosts.split(",")
    num_workers = len(worker_hosts)

    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    gpu_options = tf.GPUOptions(allocator_type='BFC', allow_growth=True)
    if args.job_name == "server":
        server_def = tf.train.ServerDef(cluster=cluster.as_cluster_def(),
            job_name='ps',
            task_index=args.task_id,
            default_session_config=tf.ConfigProto(gpu_options=gpu_options, device_count={"GPU":0}),
            protocol="grpc")
    elif args.job_name == "worker":
        server_def = tf.train.ServerDef(cluster=cluster.as_cluster_def(),
                job_name="worker",
                task_index=args.task_id,
                default_session_config = tf.ConfigProto(gpu_options=gpu_options),
                protocol="grpc")
    server = tf.train.Server(server_def)
    return server, cluster, num_workers, gpu_options

def Model(opt):
    # Give each worker different initial values and gradients so the effect
    # of model averaging is visible in the logs.
    if FLAGS.task_id == 0:
        var_0 = tf.get_variable(initializer=0.0, name='v0')
        var_1 = tf.get_variable(initializer=1.0, name='v1')
        grads_0 = tf.constant(-1.0)
        grads_1 = tf.constant(-1.0)
    else:
        var_0 = tf.get_variable(initializer=7.0, name='v0')
        var_1 = tf.get_variable(initializer=8.0, name='v1')
        grads_0 = tf.constant(-2.0)
        grads_1 = tf.constant(-2.0)
    train_op = opt.apply_gradients([[grads_0, var_0], [grads_1, var_1]],
            global_step=tf.train.get_or_create_global_step())
    return train_op

def test():
    server, cluster, num_workers, gpu_options = workers_ps_creator(FLAGS)
    if FLAGS.job_name == "server":
        server.join()
    elif FLAGS.job_name == "worker":
        is_chief = (FLAGS.task_id == 0)
        #Between-graph replication
        worker_device = "/job:worker/task:%d" % (FLAGS.task_id)
        ma_custom = model_average_optimizer.ModelAverageCustomGetter(worker_device=worker_device)
        with tf.device(
            tf.train.replica_device_setter(
                cluster=cluster,
                worker_device=worker_device,
                ps_device="/job:ps")), \
            tf.variable_scope("", custom_getter=ma_custom):
            #create model
            lr = tf.Variable(1.0, trainable=False)
            opt = tf.train.GradientDescentOptimizer(lr)
            sync_opt = model_average_optimizer.ModelAverageOptimizer(
                        opt=opt,
                        num_worker=num_workers,
                        ma_custom_getter=ma_custom,
                        is_chief=is_chief,
                        interval_steps=3)
            tf.logging.info('model start')
            train_model = Model(sync_opt)
            tf.logging.info('model end')
            ma_hook = sync_opt.make_session_run_hook()
        sess_config = tf.ConfigProto(gpu_options=gpu_options)
        sess_config.log_device_placement = False
        sess_config.allow_soft_placement = True

        all_hooks = [ma_hook]

        tf.logging.info('Start Sess')
        with tf.train.MonitoredTrainingSession(master=server.target,
                is_chief=is_chief,
                hooks=all_hooks) as sess:
            tf.logging.info("is chief: %s, len: %s", is_chief, num_workers)
            for i in range(4):
                sess.run(train_model)
                pp1 = sess.run(tf.get_default_graph().get_tensor_by_name('v0:0'))
                pp2 = sess.run(tf.get_default_graph().get_tensor_by_name('v1:0'))
                tf.logging.info("%d %.2f %.2f" % (FLAGS.task_id, pp1, pp2))
        tf.logging.info("done")

# Standard entry point, implied by the traceback (tf.app.run() -> main() -> test()).
def main(_):
    test()

if __name__ == "__main__":
    tf.app.run()

0 Answers:

No answers yet