I am running the Multi GPU CIFAR10 example. I have observed that as I increase the number of GPUs used by the example, the training time increases as well.
nvidia-smi -l 1 shows the expected GPU utilization and behavior, but, unexpectedly, the more GPUs I use, the longer the model takes to train.
I don't know whether I am missing some configuration setting before running the example. I also tried running MNIST on multiple GPUs and ran into a similar problem. Basically, I am trying to collect some multi-GPU statistics.
When I increase the number of GPUs by changing the value in for i in xrange(num_gpus):, I see the training time go up. Is there anything wrong with the code below?
import time

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets('MNIST_data', one_hot=True)  # data directory is arbitrary

TOWER_NAME = 'tower'

start_time = time.time()

def train():
    with tf.device('/cpu:0'):
        # Shared model parameters live on the CPU.
        x = tf.placeholder(tf.float32, [None, 784])
        W = tf.Variable(tf.zeros([784, 10]))
        b = tf.Variable(tf.zeros([10]))
        #y = tf.nn.softmax(tf.matmul(x, W) + b)
        y_ = tf.placeholder(tf.float32, [None, 10])
        global_step = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
        op = tf.train.GradientDescentOptimizer(0.5)

        tower_grads = []
        # Build one tower per GPU; all towers share the same variables and the same feed.
        for i in xrange(4):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                    #batch_xs, batch_ys = mnist.train.next_batch(100)
                    #batch_xs, batch_ys = queue.dequeue_many(100)
                    y = tf.nn.softmax(tf.matmul(x, W) + b)
                    #print(batch_xs)
                    cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
                    tower_gradient = op.compute_gradients(cross_entropy)
                    tower_grads.append(tower_gradient)

        # Average the per-tower gradients and apply them once.
        # average_gradients is the helper from the CIFAR10 multi-GPU example (sketched below).
        grads = average_gradients(tower_grads)
        apply_gradient_op = op.apply_gradients(grads, global_step=global_step)

    sess = tf.InteractiveSession(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
    #coord = tf.train.Coordinator()
    #enqueue_threads = qr.create_threads(sess, coord=coord, start=True)
    tf.global_variables_initializer().run()
    for _ in range(1000):
        data_batch, label_batch = mnist.train.next_batch(100)
        #data_batch, label_batch = sess.run([batch_xs, batch_ys])
        sess.run(apply_gradient_op, feed_dict={x: data_batch, y_: label_batch})
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
    #coord.request_stop()
    #coord.join(enqueue_threads)
    sess.close()

train()
print("--- %s seconds ---" % (time.time() - start_time))
Thanks & Regards