The multi-GPU CIFAR10 example takes more time as the number of GPUs increases. I am using 8 Tesla K80 GPUs

Asked: 2017-11-06 20:10:08

Tags: tensorflow tensorflow-gpu multi-gpu

I am running the multi-GPU CIFAR10 example. I have observed that as I increase the number of GPUs used in the example, the time needed for training also increases.

The nvidia-smi -l 1 command shows the expected GPU utilization and behaviour, but, unexpectedly, training the model takes longer the more GPUs I use.

I don't know whether I am missing some configuration setting before running the example. I also tried running MNIST on multiple GPUs and ran into a similar problem. Basically, I am trying to collect some statistics for multi-GPU training.
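For the statistics, I simply time one full training run per GPU count, roughly as in the hypothetical sketch below (build_and_train stands for a version of the train() function further down that takes the number of towers as a parameter; the names are only illustrative):

import time
import tensorflow as tf

for num_gpus in (1, 2, 4, 8):
    tf.reset_default_graph()      # start each measurement from a fresh graph
    start = time.time()
    build_and_train(num_gpus)     # e.g. the tower loop becomes: for i in xrange(num_gpus): ...
    print("num_gpus=%d: %.1f seconds" % (num_gpus, time.time() - start))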

When I increase the number of GPUs by changing the value in for i in xrange(num_gpus):, the run takes even more time. Is there anything wrong with the code?

import time

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

TOWER_NAME = 'tower'
tower_grads = []

start_time = time.time()

def train():

    with tf.device('/cpu:0'):
        # Placeholders and model parameters live on the CPU and are shared by all GPU towers.
        x = tf.placeholder(tf.float32, [None, 784])

        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))

        #y = tf.nn.softmax(tf.matmul(x, W) + b)

        y_ = tf.placeholder(tf.float32, [None, 10])

        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)

        op = tf.train.GradientDescentOptimizer(0.5)

    # One tower per GPU: each tower builds the same model and computes its own gradients.
    for i in xrange(4):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:

                #batch_xs, batch_ys = mnist.train.next_batch(100)
                #batch_xs, batch_ys = queue.dequeue_many(100)
                y = tf.nn.softmax(tf.matmul(x, W) + b)
                #print(batch_xs)

                cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
                tower_gradient = op.compute_gradients(cross_entropy)
                tower_grads.append(tower_gradient)

    # average_gradients is taken from the CIFAR-10 multi-GPU tutorial (sketch further down).
    grads = average_gradients(tower_grads)
    apply_gradient_op = op.apply_gradients(grads, global_step=global_step)

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
    #coord = tf.train.Coordinator()
    #enqueue_threads = qr.create_threads(sess, coord=coord, start=True)
    tf.global_variables_initializer().run()
    for _ in range(1000):
        data_batch, label_batch = mnist.train.next_batch(100)
        #data_batch, label_batch = sess.run([batch_xs,batch_ys])
        sess.run(apply_gradient_op, feed_dict={x: data_batch, y_: label_batch})
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
    #coord.request_stop()
    #coord.join(enqueue_threads)
    sess.close()

train()
print("--- %s seconds ---" % (time.time() - start_time))

Thanks & Regards

0 Answers:

No answers yet.