我有下面这段代码,它参照这里的示例,用 TensorFlow 实现同步梯度下降: https://github.com/tensorflow/models/blob/master/inception/inception/inception_distributed_train.py




# Launch one parameter server and three workers (task indices 0-2), each in the background.
(python2 SyncSGD.py --job_name='ps' --task_index=0 &) && (python2 SyncSGD.py --job_name='worker' --task_index=0 &) && (python2 SyncSGD.py --job_name='worker' --task_index=1 &) && (python2 SyncSGD.py --job_name='worker' --task_index=2 &)






from __future__ import print_function

import tensorflow as tf
from tensorflow.python.platform import app
import sys
import time

# input flags
# Command-line flags selecting which task of the cluster this process runs as.
tf.app.flags.DEFINE_string("job_name", "", "Either 'ps' or 'worker'")
tf.app.flags.DEFINE_integer("task_index", 0, "Index of task within the job")
# Module-level handle through which the rest of the file reads the flag values.
FLAGS = tf.app.flags.FLAGS

class SynchSGD:
  """Synchronous-SGD driver for one task of a distributed TensorFlow cluster.

  Every process builds the same ClusterSpec and starts the in-process server
  for the task selected by the --job_name / --task_index flags.
  """

  def __init__(self,parameter_servers,workers ):
    """Describe the cluster and start the server for this process's task.

    Args:
      parameter_servers: list of "host:port" strings for the "ps" job.
      workers: list of "host:port" strings for the "worker" job.
    """
    self.cluster = tf.train.ClusterSpec({"ps":parameter_servers, "worker":workers})
    # start a server for a specific task
    # NOTE(review): job/task are taken from the command-line flags -- confirm
    # this matches how the processes are launched.
    self.server = tf.train.Server(self.cluster,
                                  job_name=FLAGS.job_name,
                                  task_index=FLAGS.task_index)

  def run(self,fetches,fetches_format,dataset,batch_size=1,test_dataset=None,learning_rate=0.001,test_fetches_format=None,training_epochs=20, logs_path='/tmp/mnist/1'):
    """Serve variables (ps job) or run the training loop (worker job).

    Args:
      fetches: callable ``fetches(learning_rate, global_step)`` returning
        ``(inputs, ops)``. ``inputs`` is ``[x, y_]`` (the two placeholders fed
        per batch). The last two entries of ``ops`` must be the
        SyncReplicasOptimizer init-token op and chief queue runner; the
        entries before them are evaluated on every training step.
      fetches_format: dict mapping a name to its index in the per-step result
        list. When it contains 'summary' it must also contain 'step'.
      dataset: object exposing ``num_examples`` and ``next_batch(batch_size)``.
      batch_size: number of examples per training step.
      test_dataset: currently unused.
      learning_rate: forwarded to ``fetches``.
      test_fetches_format: currently unused.
      training_epochs: number of passes over ``dataset``.
      logs_path: directory the summary writer logs to.
    """
    if FLAGS.job_name == "ps":
      # A parameter server only hosts variables: block and serve requests.
      self.server.join()
    elif FLAGS.job_name == "worker":
      is_chief = (FLAGS.task_index == 0)

      # Between-graph replication
      with tf.device(tf.train.replica_device_setter(
          worker_device="/job:worker/task:%d/cpu:0" % (FLAGS.task_index),
          cluster=self.cluster)):

        print(str(FLAGS.task_index),"b4 gloabl step")
        # count the number of updates
        global_step = tf.get_variable('global_step', [],
                                      initializer = tf.constant_initializer(0),
                                      trainable = False)
        print(str(FLAGS.task_index),"b4 fetches")
        # Build the model inside the device scope so that variables are
        # placed on the parameter server(s).
        inputs, ops = fetches(learning_rate, global_step)
        # The trailing two entries are synchronization helpers, not per-step
        # fetches (a QueueRunner cannot be passed to sess.run at all).
        chief_queue_runner = ops[-1]
        init_token_op = ops[-2]
        step_ops = ops[:-2]
        init_op = tf.initialize_all_variables()

      print(str(FLAGS.task_index),"b4 sv")
      sv = tf.train.Supervisor(is_chief=is_chief,
                               global_step=global_step,
                               init_op=init_op)

      begin_time = time.time()
      frequency = 100
      print(str(FLAGS.task_index),"b4 sesh")
      with sv.prepare_or_wait_for_session(self.server.target) as sess:
        print(str(FLAGS.task_index),"b4 qr")
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)
        print(str(FLAGS.task_index)," after qr")
        if is_chief:
          sv.start_queue_runners(sess, [chief_queue_runner])
          # Seed the token queue once; without the initial tokens every
          # replica blocks waiting for permission to take its first step.
          sess.run(init_token_op)
        print(str(FLAGS.task_index)," after cqr")

        has_summary = 'summary' in fetches_format
        if has_summary:
          # create log writer object (this will log on every machine)
          writer = tf.train.SummaryWriter(logs_path, graph=tf.get_default_graph())

        # number of batches in one epoch (does not change between epochs)
        batch_count = int(dataset.num_examples/batch_size)

        # perform training cycles
        start_time = time.time()
        for epoch in range(training_epochs):

          count = 0
          for i in range(batch_count):
            batch_x, batch_y = dataset.next_batch(batch_size)
            # perform the operations we defined earlier on batch
            result = sess.run(
                            step_ops,
                            feed_dict={inputs[0]: batch_x, inputs[1]: batch_y})
            if has_summary:
              writer.add_summary(result[fetches_format['summary']], result[fetches_format['step']])

            count += 1
            if count % frequency == 0 or i+1 == batch_count:
              elapsed_time = time.time() - start_time
              start_time = time.time()
              # count is the number of batches since the last report, so the
              # average stays correct for the shorter final chunk of an epoch.
              print("Task: %d," % FLAGS.task_index,
                    " Epoch: %2d," % (epoch+1),
                    " Batch: %3d of %3d," % (i+1, batch_count),
                    " AvgTime: %3.2fms" % float(elapsed_time*1000/count))
              count = 0

        print("Total Time: %3.2fs" % float(time.time() - begin_time))

      # Stop the supervisor's background services once training is finished.
      sv.stop()


def main(argv=None):
  """Configure the local cluster, define the MNIST model and start this task.

  Args:
    argv: unused; accepted so the function can serve as the app entry point.
  """
  # cluster specification
  parameter_servers = ["localhost:2222"]
  # NOTE(review): three worker addresses are assumed because the job is
  # launched with worker task indices 0, 1 and 2 -- confirm the ports.
  workers = [ "localhost:2223",
              "localhost:2224",
              "localhost:2225"]

  # config
  batch_size = 100
  learning_rate = 0.001
  training_epochs = 3
  logs_path = "/rscratch/cs194/psharing-neural-nets/sync-logging"

  #create variables for model
  def fetches(learning_rate, global_step):
    """Build the two-layer MNIST classifier and its synchronous train op.

    Args:
      learning_rate: step size for plain gradient descent.
      global_step: variable incremented once per applied update.

    Returns:
      ([x, y_], ops) where ops is [train_op, cross_entropy, summary_op,
      global_step, accuracy, init_token_op, chief_queue_runner].
    """
    # input images
    with tf.name_scope('input'):
      # None -> batch size can be any size, 784 -> flattened mnist image
      x = tf.placeholder(tf.float32, shape=[None, 784], name="x-input")
      # target 10 output classes
      y_ = tf.placeholder(tf.float32, shape=[None, 10], name="y-input")

    # model parameters will change during training so we use tf.Variable
    with tf.name_scope("weights"):
      W1 = tf.Variable(tf.random_normal([784, 100]))
      W2 = tf.Variable(tf.random_normal([100, 10]))

    # bias
    with tf.name_scope("biases"):
      b1 = tf.Variable(tf.zeros([100]))
      b2 = tf.Variable(tf.zeros([10]))

    # implement model
    with tf.name_scope("softmax"):
      # y is our prediction
      z2 = tf.add(tf.matmul(x,W1),b1)
      a2 = tf.nn.sigmoid(z2)
      z3 = tf.add(tf.matmul(a2,W2),b2)
      y  = tf.nn.softmax(z3)

    # specify cost function
    with tf.name_scope('cross_entropy'):
      # this is our cost
      cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))

    # specify optimizer
    with tf.name_scope('train'):
      # optimizer is an "operation" which we can execute in a session
      grad_op = tf.train.GradientDescentOptimizer(learning_rate)
      print('len workers %d: '%FLAGS.task_index,str(len(workers)))
      # Aggregate one gradient from every worker before each update.
      rep_op = tf.train.SyncReplicasOptimizer(grad_op,
                                              replicas_to_aggregate=len(workers),
                                              replica_id=FLAGS.task_index,
                                              total_num_replicas=len(workers))
      grads = rep_op.compute_gradients(cross_entropy)
      apply_gradients_op = rep_op.apply_gradients(grads,global_step=global_step)
      with tf.control_dependencies([apply_gradients_op]):
        # Fetching train_op forces the synchronized update to run first.
        train_op = tf.identity(cross_entropy, name='train_op')

    init_token_op = rep_op.get_init_tokens_op()
    chief_queue_runner = rep_op.get_chief_queue_runner()

    with tf.name_scope('Accuracy'):
      # accuracy
      correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
      accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # create a summary for our cost and accuracy
    tf.scalar_summary("cost", cross_entropy)
    tf.scalar_summary("accuracy", accuracy)
    # merge all summaries into a single "operation" which we can execute in a session
    summary_op = tf.merge_all_summaries()
    return [x,y_],[train_op, cross_entropy, summary_op, global_step, accuracy, init_token_op, chief_queue_runner]

  # load mnist data set (read once and reuse for both splits)
  from tensorflow.examples.tutorials.mnist import input_data
  mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
  dataset = mnist.train
  test_dataset = mnist.test

  # positions of the per-step results inside the list returned by fetches()
  fetches_format = {'train':0,'cost':1,'summary':2,'step':3}
  test_fetches_format = {'accuracy':0}

  sgd = SynchSGD(parameter_servers, workers)
  sgd.run(fetches,fetches_format,dataset,batch_size=batch_size,learning_rate=learning_rate,test_dataset=test_dataset,training_epochs=training_epochs, logs_path=logs_path)

if __name__=="__main__":
  # Parse the command-line flags, then call main().
  tf.app.run()

出于某种原因,它似乎创建了一堆额外的进程(看起来像是用来与其他任务通信的进程),并在 8 块 K40 上产生了如下的显存分配(见其中的 python2 进程):

| Processes:                                                       GPU Memory | 
|  GPU       PID  Type  Process name                               Usage      |                                                                                                                                                             
|    0     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    0     28733    C   python2                                        389MiB |                                                                                                                                                             
|    0     28735    C   python2                                        123MiB |                                                                                                                                                             
|    0     28737    C   python2                                        106MiB |                                                                                                                                                             
|    0     28739    C   python2                                       2976MiB |                                                                                                                                                             
|    1     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    1     28733    C   python2                                        206MiB |                                                                                                                                                             
|    1     28735    C   python2                                        306MiB |                                                                                                                                                             
|    1     28737    C   python2                                        106MiB |                                                                                                                                                             
|    1     28739    C   python2                                       2976MiB |                                                                                                                                                             
|    2     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    2     28733    C   python2                                        206MiB |                                                                                                                                                             
|    2     28735    C   python2                                        306MiB |                                                                                                                                                             
|    2     28737    C   python2                                        106MiB |                                                                                                                                                             
|    2     28739    C   python2                                       2974MiB |                                                                                                                                                             
|    3     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    3     28733    C   python2                                        206MiB |                                                                                                                                                             
|    3     28735    C   python2                                        306MiB |                                                                                                                                                             
|    3     28737    C   python2                                        106MiB |                                                                                                                                                             
|    3     28739    C   python2                                       2974MiB |                                                                                                                                                             
|    4     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    4     28733    C   python2                                        206MiB |                                                                                                                                                             
|    4     28735    C   python2                                        286MiB |                                                                                                                                                             
|    4     28737    C   python2                                        106MiB |                                                                                                                                                             
|    4     28739    C   python2                                       2976MiB |                                                                                                                                                             
|    5     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    5     28733    C   python2                                       2976MiB |                                                                                                                                                             
|    5     28735    C   python2                                        306MiB |                                                                                                                                                             
|    5     28737    C   python2                                        106MiB |                                                                                                                                                             
|    5     28739    C   python2                                        206MiB |                                                                                                                                                             
|    6     26489    C   ./train-imagenet-sgd                          8704MiB |                                                                                                                                                             
|    6     28733    C   python2                                        206MiB |                                                                                                                                                             
|    6     28735    C   python2                                        106MiB |                                                                                                                                                             
|    6     28737    C   python2                                        106MiB |                                                                                                                                                             
|    6     28739    C   python2                                        106MiB |                                                                                                                                                             
|    7     26489    C   ./train-imagenet-sgd                          8598MiB |                                                                                                                                                             
|    7     28733    C   python2                                        107MiB |                                                                                                                                                             
|    7     28735    C   python2                                        106MiB |                                                                                                                                                             
|    7     28737    C   python2                                        106MiB |                                                                                                                                                             
|    7     28739    C   python2                                        106MiB |                                                                                                                                                             


E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.80G (3009019904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.52G (2708117760 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.27G (2437305856 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 2.04G (2193575168 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.84G (1974217728 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.65G (1776795904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.49G (1599116288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.34G (1439204608 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.21G (1295284224 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1.08G (1165755904 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 1000.58M (1049180416 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 900.52M (944262400 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 810.47M (849836288 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 729.42M (764852736 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 656.48M (688367616 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 590.83M (619531008 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 531.75M (557577984 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 478.57M (501820160 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY
E tensorflow/stream_executor/cuda/cuda_driver.cc:965] failed to allocate 430.72M (451638272 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY


