当我在tensorflow中使用multi gpu时,错误出现如下:
Traceback (most recent call last):
File "multi_gpu_train.py", line 290, in <module>
tf.app.run()
File "/usr/lib/python2.7/site-packages/tensorflow/python/platform/app.py", line 48, in run
_sys.exit(main(_sys.argv[:1] + flags_passthrough))
File "multi_gpu_train.py", line 286, in main
train()
File "multi_gpu_train.py", line 187, in train
loss = tower_loss(scope)
File "multi_gpu_train.py", line 94, in tower_loss
loss_averages_op = loss_averages.apply(losses + [total_loss])
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/moving_averages.py", line 375, in apply
colocate_with_primary=(var.op.type in ["Variable", "VariableV2"]))
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 174, in create_zeros_slot
colocate_with_primary=colocate_with_primary)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 149, in create_slot_with_initializer
dtype)
File "/usr/lib/python2.7/site-packages/tensorflow/python/training/slot_creator.py", line 66, in _create_slot_var
validate_shape=validate_shape)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 1065, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 962, in get_variable
use_resource=use_resource, custom_getter=custom_getter)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 367, in get_variable
validate_shape=validate_shape, use_resource=use_resource)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 352, in _true_getter
use_resource=use_resource)
File "/usr/lib/python2.7/site-packages/tensorflow/python/ops/variable_scope.py", line 682, in _get_single_variable
"VarScope?" % name)
ValueError: Variable tower_1/loss/xentropy_mean/avg/ does not exist, or was not created with tf.get_variable(). Did you mean to set reuse=None in VarScope?
并且主要功能如下所示,它使用tower_loss功能
tower_grads = []
for i in xrange(FLAGS.num_gpus):
with tf.device('/gpu:%d' % GPU[i]):
with tf.name_scope('%s_%d' % (TOWER_NAME, GPU[i])) as scope:
# Calculate the loss for one tower of the CIFAR model. This function
# constructs the entire CIFAR model but shares the variables across
# all towers.
loss = tower_loss(scope)
# reuse = True
# Reuse variables for the next tower.
tf.get_variable_scope().reuse_variables()
# Retain the summaries from the final tower.
summaries = tf.get_collection(tf.GraphKeys.SUMMARIES, scope)
# Calculate the gradients for the batch of data on this CIFAR tower.
grads = opt.compute_gradients(loss)
# Keep track of the gradients across all towers.
tower_grads.append(grads)
# We must calculate the mean of each gradient. Note that this is the
# synchronization point across all towers.
grads = average_gradients(tower_grads)
tower_loss功能如下所示。错误信息显示在tower_1中出现错误,并且可以使用tower_0。这意味着
中的第一次迭代for i in xrange(FLAGS.num_gpus):
成功了,我不知道为什么。
def tower_loss(scope):
"""Calculate the total loss on a single tower running the CIFAR model.
Args:
scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
Returns:
Tensor of shape [] containing the total loss for a batch of data
"""
# Get images and labels for CIFAR-10.
images, labels = load_train_data.input_pipeline(FLAGS.img_path, FLAGS.label_path, FLAGS.csv_file, FLAGS.batch_size,trainning=True)
# Build inference Graph.
vgg_net = vgg16.FCN8VGG('./../lane_seg/vgg16.npy')
vgg_net.build(images,train=True,debug=False,num_classes=load_train_data.NUM_CLASSES)
logits = vgg_net.upscore32
# Build the portion of the Graph calculating the losses. Note that we will
# assemble the total_loss using a custom function below.
labels = tf.squeeze(labels, squeeze_dims=[3])
loss_weights = [0.00588551861547, 0.500363638561, 0.493750842824]
_ = weighted_loss(logits=logits,labels=labels,num_classes=load_train_data.NUM_CLASSES,head=loss_weights)
# _ = cifar10.loss(logits, labels)
# Assemble all of the losses for the current tower only.
losses = tf.get_collection('losses', scope)
# Calculate the total loss for the current tower.
total_loss = tf.add_n(losses, name='total_loss')
# Compute the moving average of all individual losses and the total loss.
loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
loss_averages_op = loss_averages.apply(losses + [total_loss])
# Attach a scalar summary to all individual losses and the total loss; do the
# same for the averaged version of the losses.
for l in losses + [total_loss]:
# Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
# session. This helps the clarity of presentation on tensorboard.
loss_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', l.op.name)
# Name each loss as '(raw)' and name the moving average version of the loss
# as the original loss name.
tf.summary.scalar(loss_name +' (raw)', l)
tf.summary.scalar(loss_name, loss_averages.average(l))
with tf.control_dependencies([loss_averages_op]):
total_loss = tf.identity(total_loss)
return total_loss
答案 0 :(得分:1)
我找到了答案,下面的代码是旧版本,最新代码发布在https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py
更新代码,它可以成功运行!