Question

我正在使用官方代码测试分布式 TensorFlow。我有两台服务器，一台装有 K80 GPU，另一台装有 M40 GPU。但是在集群上运行代码时，只有被指定为 "is_chief = True" 的 worker 可以使用 GPU 进行计算，而另一个 worker 只使用 CPU，即使我已经为它指定了 GPU 设备。我的部分代码如下：

def main(_):
    """Run one task of a two-worker distributed MNIST training job.

    Starts a ``tf.train.Server`` for this process, builds the model graph
    and trains it inside a ``tf.train.MonitoredTrainingSession``. The task
    whose ``FLAGS.task_index`` is 0 acts as chief.

    Args:
        _: Unused positional argument.
    """
    worker_hosts = ['ipaddress0:2222', 'ipaddress1:2222']
    cluster = tf.train.ClusterSpec({"worker": worker_hosts})
    # NOTE(review): job_name is hard-coded to 'worker' although the branch
    # below consults FLAGS.job_name, and the cluster defines no "ps" job --
    # confirm whether a parameter-server job was intended.
    server = tf.train.Server(cluster, job_name='worker', task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
        server.join()
    elif FLAGS.job_name == "worker":
        mnist = input_data.read_data_sets('data', one_hot=True)

        worker_device = "/job:worker/task:%d/gpu:0" % (FLAGS.task_index)
        # NOTE(review): the cluster spec has no "ps" job. replica_device_setter
        # appears to return None when there are no parameter-server tasks,
        # which would make tf.device() a no-op and leave every op unpinned --
        # a likely reason only one worker's GPU is used. Verify against the
        # TensorFlow version in use; adding a "ps" job to the ClusterSpec is
        # the usual setup.
        device_setter = tf.train.replica_device_setter(worker_device=worker_device, cluster=cluster)
        with tf.device(device_setter):
            print('building model...')
            global_step = tf.contrib.framework.get_or_create_global_step()
            train_step, cross_entropy, accuracy, y, x, y_, keep_prob = build_model(global_step)

        hooks = [tf.train.StopAtStepHook(last_step=1000000)]
        # MonitoredTrainingSession initializes variables or restores them from
        # checkpoint_dir on its own, so no explicit initializer is run here:
        # running one would overwrite restored (or already-trained) values.
        with tf.train.MonitoredTrainingSession(master=server.target,
                                               is_chief=(FLAGS.task_index == 0),
                                               checkpoint_dir="/tmp/train_logs",
                                               hooks=hooks,
                                               ) as mon_sess:
            i = 0
            while not mon_sess.should_stop() and i <= 20000:
                i += 1
                batch = mnist.train.next_batch(50)
                if i % 50 == 0:
                    # Fetch the step counter and the accuracy in a single run
                    # so the reported step matches the evaluated state.
                    step, train_accuracy = mon_sess.run(
                        [global_step, accuracy],
                        feed_dict={x: batch[0], y_: batch[1], keep_prob: 1.0})
                    print ("step %d, training accuracy %g"%(step, train_accuracy))
                mon_sess.run(train_step, feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
            # run() raises once a stop has been requested, so only evaluate
            # while the session is still usable. Evaluate on the held-out test
            # split rather than on the last training batch.
            if not mon_sess.should_stop():
                print ("test accuracy %g"%mon_sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))

ipaddress0（task_index 为 0）的服务器可以使用 K80 GPU 进行计算，而另一台服务器只使用 CPU。有谁知道如何解决这个问题？请提供一些帮助，谢谢。

GPU 在分布式 TensorFlow 集群中无法工作

0 个答案: