Question

当我在 TensorFlow 中使用多 GPU（multi-GPU）训练时，出现了如下错误：

    Traceback (most recent call last):
  File "multi_gpu_train.py", line 290, in <module>
    tf.app.run()
  File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
    _sys.exit(main(_sys.argv[:1] + flags_passthrough))
  File "multi_gpu_train.py", line 286, in main
    train()
  File "multi_gpu_train.py", line 187, in train
    loss = tower_loss(scope)
  File "multi_gpu_train.py", line 94, in tower_loss
    loss_averages_op = loss_averages.apply(losses + [total_loss])
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/moving_averages.py", line 375, in apply
    colocate_with_primary=(var.op.type in ["Variable", "VariableV2"]))
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 174, in create_zeros_slot
    colocate_with_primary=colocate_with_primary)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 149, in create_slot_with_initializer
    dtype)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 66, in _create_slot_var
    validate_shape=validate_shape)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
    use_resource=use_resource, custom_getter=custom_getter)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
    validate_shape=validate_shape, use_resource=use_resource)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
    use_resource=use_resource)
  File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 682, in _get_single_variable
    "VarScope?" % name)
ValueError: Variable tower_1/loss/xentropy_mean/avg/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?

主要的训练代码如下所示，其中调用了 tower_loss 函数：

 # Per-tower gradient lists, one entry appended per GPU below.
 tower_grads = []
    for i in xrange(FLAGS.num_gpus):
      # Pin this tower's ops to one GPU; GPU[i] supplies the device index.
      with tf.device('/gpu:%d' % GPU[i]):
        # The name scope string (e.g. 'tower_0/') is handed to tower_loss and
        # reused below to filter the summaries collection.
        with tf.name_scope('%s_%d' % (TOWER_NAME, GPU[i])) as scope:
          # Build the model and the loss for this tower. tower_loss constructs
          # the entire model; the variables are meant to be shared across
          # all towers.

          loss = tower_loss(scope)
          # reuse = True

          # Reuse variables for the next tower.
          # NOTE(review): from here on, tf.get_variable() in this variable
          # scope only looks up existing variables. The traceback shows the
          # second call to tower_loss failing inside
          # ExponentialMovingAverage.apply(), which goes through
          # tf.get_variable() for 'tower_1/loss/xentropy_mean/avg/'.
          tf.get_variable_scope().reuse_variables()

          # Retain the summaries from the final tower.
          summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)

          # Calculate the gradients for the batch of data on this tower.
          grads = opt.compute_gradients(loss)

          # Keep track of the gradients across all towers.
          tower_grads.append(grads)

    # We must calculate the mean of each gradient. Note that this is the
    # synchronization point across all towers.
    grads = average_gradients(tower_grads)

tower_loss 函数如下所示。错误信息显示错误出现在 tower_1 中，而 tower_0 可以正常构建。这意味着循环

for i in xrange(FLAGS.num_gpus):

的第一次迭代成功了，而第二次迭代失败了，我不知道为什么。

def tower_loss(scope):
  """Build the model for one tower and return that tower's total loss.

  Creates the input pipeline, the FCN8VGG network and the weighted loss,
  sums every entry of the 'losses' collection that belongs to `scope`, and
  attaches scalar summaries for the raw and moving-averaged losses.

  Args:
    scope: unique prefix string identifying the tower, e.g. 'tower_0'; used
      to filter the 'losses' collection.

  Returns:
     Tensor (documented by the original author as shape []) containing the
     total loss for a batch of data. It carries a control dependency on the
     moving-average update op.
  """
  # Build the training input pipeline that yields images and labels.
  images, labels = load_train_data.input_pipeline(FLAGS.img_path, FLAGS.label_path, FLAGS.csv_file, FLAGS.batch_size,trainning=True)
  # Build inference Graph.
  vgg_net = vgg16.FCN8VGG('./../lane_seg/vgg16.npy')
  vgg_net.build(images,train=True,debug=False,num_classes=load_train_data.NUM_CLASSES) 

  # The network's `upscore32` output is used as the logits.
  logits = vgg_net.upscore32

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  # Drop axis 3 of the labels tensor before computing the loss.
  labels = tf.squeeze(labels, squeeze_dims=[3])
  # Passed to weighted_loss as `head`; presumably one weight per class
  # (three entries) -- TODO confirm against weighted_loss.
  loss_weights = [0.00588551861547, 0.500363638561, 0.493750842824]
  # The return value is discarded; the loss terms are read back from the
  # 'losses' collection below.
  _ = weighted_loss(logits=logits,labels=labels,num_classes=load_train_data.NUM_CLASSES,head=loss_weights)
  # _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')

  # NOTE(review): this is the call that fails in the traceback. apply()
  # creates its averaging variables through tf.get_variable(); once the
  # caller has invoked tf.get_variable_scope().reuse_variables() after the
  # first tower, the variable is looked up instead of created, and
  # 'tower_1/loss/xentropy_mean/avg/' does not exist yet, hence the
  # ValueError on the second tower.
  loss_averages_op = loss_averages.apply(losses + [total_loss])

  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.summary.scalar(loss_name +' (raw)', l)
    tf.summary.scalar(loss_name, loss_averages.average(l))

  # Re-create total_loss under a control dependency so that evaluating the
  # returned tensor also runs the moving-average update.
  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)

  return total_loss

Answer 1

我找到了答案：问题中使用的代码是旧版本，最新的代码发布在 https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py

按照最新版本更新代码之后，就可以成功运行了！

TensorFlow 多 GPU tower 错误：loss = tower_loss(scope)，ValueError：变量 tower_1/loss/xentropy_mean/avg/ 不存在

1 个答案: