# Synchronous data-parallel training: every process (identified by `rank`,
# out of `size` processes) runs this same script as one task of the "local"
# job.  `rank`, `size`, `learning_rate`, `cost`, `global_step`, `x`, `y`,
# `batch_x` and `batch_y` are expected to be defined by the surrounding code.

# One distinct address per task.  The list must be identical in every
# process, so it is indexed by the loop variable rather than by this
# process's own rank.
jobs = ["localhost:%d" % (2220 + i) for i in range(size)]
cluster = tf.train.ClusterSpec({"local": jobs})
server = tf.train.Server(cluster, job_name="local", task_index=rank)
is_chief = (rank == 0)

# NOTE(review): every replica pins its ops (and any variables created here)
# to its own task.  For size > 1 the replicas presumably need to share the
# variables and `global_step` (e.g. via tf.train.replica_device_setter) --
# confirm against the tensor-building code.
with tf.device("/job:local/task:" + str(rank)):
    # code to make tensors
    base_optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=learning_rate)
    # Wrap the optimizer so that `size` gradients are aggregated per step.
    # NOTE(review): `replica_id` belongs to the pre-1.0 SyncReplicasOptimizer
    # signature, which the rest of this script (initialize_all_variables,
    # Supervisor) appears to target -- drop it on newer releases.
    optimizer = tf.train.SyncReplicasOptimizer(
        base_optimizer,
        replicas_to_aggregate=size,
        replica_id=rank,
        total_num_replicas=size)
    # Keep the optimizer object: it is still needed below to obtain the
    # token-initialisation op and the chief queue runner.
    train_op = optimizer.minimize(cost, global_step=global_step)
    # Both must be created after minimize() and before the Supervisor is
    # built, because they add ops to the graph.
    init_tokens_op = optimizer.get_init_tokens_op()
    chief_queue_runner = optimizer.get_chief_queue_runner()

init_op = tf.initialize_all_variables()
print("Network initialization done in core %s" % rank)

sv = tf.train.Supervisor(is_chief=is_chief,
                         logdir="/tmp/train_logs",
                         init_op=init_op,
                         saver=None,
                         recovery_wait_secs=100,
                         global_step=global_step)
sess_config = tf.ConfigProto()

# The chief initialises the model; the other replicas wait here until the
# model is ready.
with sv.prepare_or_wait_for_session(server.target,
                                    config=sess_config) as sess:
    if is_chief:
        # Without these two calls every replica (including a lone chief)
        # blocks forever in sess.run(train_op): nothing aggregates the
        # gradients and the token queue the train op dequeues from is empty.
        sv.start_queue_runners(sess, [chief_queue_runner])
        sess.run(init_tokens_op)

    print("Started trainning on process %s" % rank)
    for epoch in range(1):
        _, c, g = sess.run([train_op, cost, global_step],
                           feed_dict={x: batch_x, y: batch_y})

    # Stop the Supervisor's service threads while the session is still open.
    sv.stop()