I have the following code implementing a 2-layer ConvNet in TensorFlow. It works perfectly when I use it on a single-GPU setup, as follows:
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib

learning_rate = 0.001
row_size = self.features_size[0]
col_size = self.features_size[1]
conv1_size = self.h_layers[0]
conv2_size = self.h_layers[1]

x = tf.placeholder(tf.float32, [None, row_size, col_size])
y = tf.placeholder(tf.float32, [None, self.n_class])

weights = {'W_conv1': tf.Variable(tf.random_normal([conv1_size[0], conv1_size[1], 1, 32])),
           'W_conv2': tf.Variable(tf.random_normal([conv2_size[0], conv2_size[1], 32, 64])),
           'W_fc': tf.Variable(tf.random_normal([row_size * col_size * 4, 1024])),
           'out': tf.Variable(tf.random_normal([1024, self.n_class]))}
biases = {'b_conv1': tf.Variable(tf.random_normal([32])),
          'b_conv2': tf.Variable(tf.random_normal([64])),
          'b_fc': tf.Variable(tf.random_normal([1024])),
          'out': tf.Variable(tf.random_normal([self.n_class]))}

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def maxpool2d(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

def get_available_gpus(**args):
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

def conv_net(x, W, B, dropout_rate, is_training):
    # Add a channel dimension: [batch, rows, cols] -> [batch, rows, cols, 1].
    x = tf.reshape(x, shape=[-1, x.shape[1], x.shape[2], 1])
    conv1 = conv2d(x, W['W_conv1'])
    conv1 = tf.nn.relu(conv1 + B['b_conv1'])
    conv1 = maxpool2d(conv1)
    conv2 = tf.nn.relu(conv2d(conv1, W['W_conv2']) + B['b_conv2'])
    conv2 = maxpool2d(conv2)
    fc1 = tf.contrib.layers.flatten(conv2)
    fc1 = tf.nn.relu(tf.matmul(fc1, W['W_fc']) + B['b_fc'])
    fc1 = tf.layers.dropout(fc1, rate=dropout_rate, training=is_training)
    output = tf.matmul(fc1, W['out']) + B['out']
    return output
######## THIS PART IS SPECIFIC TO SINGLE-GPU #######
logits_train = conv_net(x, weights, biases, self.dropout_rate, is_training=True)
prediction = tf.argmax(logits_train, axis=1)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits_train, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_step = optimizer.minimize(cross_entropy, global_step=tf.train.get_global_step())
#####################################################
global_step = tf.Variable(0, name='global_step', trainable=False)
prediction, cross_entropy, train_step, x, y = self.set_model()
sess = tf.Session()
saver = tf.train.Saver(max_to_keep=2)
fetch = tf.global_variables_initializer()
sess.run([fetch])
for epoch in range(self.epoch):
    epoch_loss = 0
    i = 0
    while i < len(features):
        start = i
        end = i + self.batch_size
        batch_x = np.array(features[start:end])
        batch_y = np.array(labels[start:end])
        _, c = sess.run([train_step, cross_entropy], feed_dict={x: batch_x, y: batch_y})
        epoch_loss += c
        i += self.batch_size
    print('Global Step:', sess.run(global_step), ', Epoch:', epoch, 'completed out of', self.epoch, 'loss:', epoch_loss)
This runs fine on a single GPU, but I want to make it use multiple NVIDIA Tesla K80 GPUs with CUDA.
Following the TensorFlow documentation, I modified the part that calls the optimizer so that:
################ MULTI-GPU MODIFICATION ############
gpu_list = get_available_gpus()
tower_grads = []
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
if gpu_list:
    print(gpu_list)
    for gpu in gpu_list:
        with tf.device(gpu):
            logits_train = conv_net(x, weights, biases, self.dropout_rate, is_training=True)
            prediction = tf.argmax(logits_train, axis=1)
            cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits_train, labels=y))
            grads = optimizer.compute_gradients(cross_entropy)  # , global_step=tf.train.get_global_step())
            tower_grads.append(grads)
    grads = average_gradients(tower_grads)
    apply_gradient_op = optimizer.apply_gradients(grads, global_step=tf.train.get_global_step())
    train_step = tf.group(apply_gradient_op)
#################################################
Even this works, and it does use the resources of all my GPUs at the same time, but it fails to converge: on the same problem where the single-GPU approach converges quickly, the loss stays flat after hundreds of iterations. I suspect the aggregation of the tower gradients, but since I use the exact average_gradients() function that TensorFlow provides in cifar10_multi_gpu_train.py, also documented here, I can't imagine that it is the problem. Anyway, here it is:
def average_gradients(tower_grads):
    """
    Provided by Tensorflow doc
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)
        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
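For reference, a minimal sanity check one could run on it (the two constant tower gradients and the shared variable are hypothetical, not part of my model) averages as expected:

# Hypothetical check: two towers, one shared variable, constant gradients.
v = tf.Variable(0.0)
tower_grads_test = [[(tf.constant(1.0), v)], [(tf.constant(3.0), v)]]
avg = average_gradients(tower_grads_test)
with tf.Session() as test_sess:
    # Mean of 1.0 and 3.0 over the tower dimension.
    print(test_sess.run(avg[0][0]))  # 2.0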
Any idea where this convergence failure could come from?
I am also wondering about the fact that I don't split the batch according to the number of GPUs (a sketch of what that split could look like is below). Any other input?
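For completeness, here is a minimal sketch of the per-GPU batch split I have in mind. The tf.split approach is my own guess adapted from the tower loop above, not something I have verified, and it assumes the batch size fed at run time is divisible by the number of GPUs:

################ HYPOTHETICAL PER-GPU BATCH SPLIT ############
gpu_list = get_available_gpus()
tower_grads = []
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
if gpu_list:
    # Split the batch dimension into one slice per GPU (checked at run time
    # that the fed batch size is divisible by len(gpu_list)).
    x_splits = tf.split(x, num_or_size_splits=len(gpu_list), axis=0)
    y_splits = tf.split(y, num_or_size_splits=len(gpu_list), axis=0)
    for i, gpu in enumerate(gpu_list):
        with tf.device(gpu):
            # Each tower now sees only its own slice of the batch.
            logits_train = conv_net(x_splits[i], weights, biases,
                                    self.dropout_rate, is_training=True)
            cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                logits=logits_train, labels=y_splits[i]))
            tower_grads.append(optimizer.compute_gradients(cross_entropy))
    grads = average_gradients(tower_grads)
    train_step = optimizer.apply_gradients(grads, global_step=tf.train.get_global_step())
##############################################################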