I have a network that trains batch normalization (BN) layers. My batch size is 16, so I have to use multiple GPUs. I followed the Inception v3 example, which can be summarized as:
with tf.Graph().as_default(), tf.device('/cpu:0'):
    images_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=images)
    labels_splits = tf.split(axis=0, num_or_size_splits=FLAGS.num_gpus, value=labels)
    for i in range(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (inception.TOWER_NAME, i)) as scope:
                ...
                # Reuse variables for the next tower.
                batchnorm_updates = tf.get_collection(slim.ops.UPDATE_OPS_COLLECTION,
                                                      scope)
                grads = opt.compute_gradients(loss)
                tower_grads.append(grads)
    grads = _average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
    variable_averages = tf.train.ExponentialMovingAverage(
        inception.MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (tf.trainable_variables() +
                            tf.moving_average_variables())
    variables_averages_op = variable_averages.apply(variables_to_average)
    batchnorm_updates_op = tf.group(*batchnorm_updates)
    train_op = tf.group(apply_gradient_op, variables_averages_op,
                        batchnorm_updates_op)
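The _average_gradients helper is not shown above; roughly, following the standard TensorFlow multi-GPU examples (a sketch, not the exact Inception code), it averages each variable's gradient across the towers:

def _average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad_gpu0, var), (grad_gpu1, var), ...) for one variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads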
Unfortunately, that example uses the slim BN layer library, while I use the standard BN, tf.contrib.layers.batch_norm:

def _batch_norm(self, x, name, is_training, activation_fn, trainable=False):
    with tf.variable_scope(name + '/BatchNorm') as scope:
        o = tf.contrib.layers.batch_norm(
            x,
            scale=True,
            activation_fn=activation_fn,
            is_training=is_training,
            trainable=trainable,
            scope=scope)
        return o
To collect moving_mean and moving_variance, I use tf.GraphKeys.UPDATE_OPS:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    self.train_op = tf.group(train_op_conv, train_op_fc)
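(For reference, a minimal standalone TF 1.x sketch; the shape and scope name here are illustrative, not from my model. It shows that tf.contrib.layers.batch_norm only registers its moving_mean/moving_variance updates in tf.GraphKeys.UPDATE_OPS rather than applying them automatically.)

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 32, 32, 64], name='x')
y = tf.contrib.layers.batch_norm(x, scale=True, is_training=True, scope='demo_bn')
# With the default updates_collections, the moving-average assign ops are only
# registered, not executed; they must be run via a control dependency.
print(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
# Expected: two assign ops for this layer (moving_mean and moving_variance).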
Finally, the idea of using BN across multiple GPUs is borrowed from Inception v3:
split_image_batch = tf.split(self.image_batch, self.conf.num_gpus, 0)
split_label_batch = tf.split(self.label_batch, self.conf.num_gpus, 0)
global_step = tf.train.get_or_create_global_step()
opt = tf.train.MomentumOptimizer(self.learning_rate, self.conf.momentum)
tower_grads_encoder = []
tower_grads_decoder = []
update_ops = []
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(self.conf.num_gpus):
        with tf.device('/gpu:%d' % i):
            net = Resnet(split_image_batch[i], self.conf.num_classes)  # Build BN layers
            # Loss function
            self.reduced_loss = tf.reduce_mean(loss) + tf.add_n(l2_losses)
            # Reuse variables for the next GPU.
            tf.get_variable_scope().reuse_variables()
            update_ops.extend(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
            # Compute grads
            grads_encoder = opt.compute_gradients(self.reduced_loss, var_list=encoder_trainable)
            grads_decoder = opt.compute_gradients(self.reduced_loss, var_list=decoder_trainable)
            tower_grads_encoder.append(grads_encoder)
            tower_grads_decoder.append(grads_decoder)

grads_encoder = self._average_gradients(tower_grads_encoder)
grads_decoder = self._average_gradients(tower_grads_decoder)
# Update params
train_op_conv = opt.apply_gradients(grads_encoder, global_step=global_step)
train_op_fc = opt.apply_gradients(grads_decoder, global_step=global_step)
variable_averages = tf.train.ExponentialMovingAverage(self.conf.MOVING_AVERAGE_DECAY, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())

with tf.control_dependencies(update_ops):
    self.train_op = tf.group(train_op_conv, train_op_fc, variables_averages_op)
Although the code runs without errors, the performance is very low. It looks like I am not collecting the BN parameters correctly. Could you take a look at my code and give me some direction for training BN across multiple GPUs? Thanks.
Answer 0 (score: 0)
I suspect the performance problem is related to the fact that you perform multiple variable updates each step (one batch-norm update per tower).
Is there a reason you need the batch-norm updates from every GPU? We recommend updating batch norm using the statistics from a single tower only; unless your data partitioning is somehow skewed (which would cause other problems anyway), the statistics should be roughly the same across towers.
If you limit the batch-norm updates to those from a single tower, you reduce the number of variable updates by a factor of num_gpus.
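Here is a minimal, self-contained sketch of that suggestion (a toy TF 1.x graph; the tensor shape and the tower_%d naming are assumptions for illustration, not your exact code). It collects UPDATE_OPS only from the first tower's name scope and runs just those updates together with the train op:

import tensorflow as tf

num_gpus = 2
x = tf.placeholder(tf.float32, [None, 8], name='x')
splits = tf.split(x, num_gpus, 0)

bn_updates = []
with tf.variable_scope(tf.get_variable_scope()):
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i) as tower_scope:
                h = tf.contrib.layers.batch_norm(splits[i], scale=True,
                                                 is_training=True, scope='bn')
                if i == 0:
                    # Keep the moving_mean/moving_variance updates from the first
                    # tower only; the scope argument filters ops by name prefix.
                    bn_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS, tower_scope)
                # Reuse variables for the next tower.
                tf.get_variable_scope().reuse_variables()

# Placeholder for opt.apply_gradients(...); grouped with the single tower's BN updates.
apply_gradient_op = tf.no_op(name='apply_gradients')
with tf.control_dependencies(bn_updates):
    train_op = tf.group(apply_gradient_op)

In your code, this corresponds to calling tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope) with a per-tower scope inside the loop for one tower only, instead of extending update_ops on every iteration.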