I have been having a hard time getting a distributed TensorFlow session to start. I have been launching the jobs with MPI, and I have a running version that uses MPI to aggregate the work, but the code is simpler in pure TensorFlow. Can anyone help me figure out what I am doing wrong?
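For context, rank and size in the code below come from the MPI launch and are not shown. Roughly, assuming mpi4py (a sketch, not my verbatim setup):

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()  # this process's task index
size = comm.Get_size()  # total number of processes started by mpirun
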
jobs = ["localhost:222"+str(rank) for i in range(size)] #creates the workers
cluster = tf.train.ClusterSpec({"local": jobs})
server = tf.train.Server(cluster, job_name="local", task_index=rank)
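# Each process now runs an in-process gRPC server for its own task; e.g. with
# size == 2 the cluster is {"local": ["localhost:2220", "localhost:2221"]}
# and this process serves /job:local/task:<rank>.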
with tf.device("/job:local/task:" + str(rank)):  # assigns the devices
    # code to make tensors
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    optimizer = tf.train.SyncReplicasOptimizer(optimizer,
                                               replicas_to_aggregate=size,  # syncs the work
                                               total_num_replicas=size,
                                               replica_id=rank,
                                               name="Fact")
    optimizer = optimizer.minimize(cost, global_step=global_step)
    init_op = tf.initialize_all_variables()
print "Network initialization done in core", rank
sv = tf.train.Supervisor(is_chief=(rank == 0), logdir="/tmp/train_logs",
                         init_op=init_op, saver=None,
                         recovery_wait_secs=100, global_step=global_step)
sess_config = tf.ConfigProto()
with sv.prepare_or_wait_for_session(server.target, config=sess_config) as sess:
    # all replicas freeze here and never start working
    print "Started training on process", rank
    for epoch in range(1):
        # the chief freezes on this run call as it attempts to aggregate
        # results, even when it is the only worker and size is set to 1; the
        # code does not block if the optimizer is not SyncReplicasOptimizer
        _, c, g = sess.run([optimizer, cost, global_step], feed_dict={x: batch_x, y: batch_y})
sv.stop()  # never gets here
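For completeness, the SyncReplicasOptimizer docs from that TensorFlow release also describe chief-side setup (get_chief_queue_runner and get_init_tokens_op) that is supposed to run before training. Below is a sketch of that wiring, reusing learning_rate, cost, global_step, sv, server, and sess_config from above; base_opt, sync_opt, and train_op are names I introduce here, and I have not confirmed whether skipping this step is what causes the hang:

# keep a separate handle on the SyncReplicasOptimizer itself, since calling
# minimize() on the same name (as above) overwrites it with the train op
base_opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
sync_opt = tf.train.SyncReplicasOptimizer(base_opt, replicas_to_aggregate=size,
                                          total_num_replicas=size,
                                          replica_id=rank, name="Fact")
train_op = sync_opt.minimize(cost, global_step=global_step)

if rank == 0:
    chief_queue_runner = sync_opt.get_chief_queue_runner()
    init_tokens_op = sync_opt.get_init_tokens_op()

with sv.prepare_or_wait_for_session(server.target, config=sess_config) as sess:
    if rank == 0:
        # the chief must start the queue runner and feed the initial tokens,
        # otherwise the replicas block waiting on the aggregation queues
        sv.start_queue_runners(sess, [chief_queue_runner])
        sess.run(init_tokens_op)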