I wrote a multi-GPU version of a very simple training job. However, running it on 1 CPU + 2 GPUs takes longer than running it on a single CPU, even when I set max_step to a very large value such as 1000000.
The performance looks like this: the 1 CPU + 2 GPU run takes about 27 s, while the single-CPU run takes about 20 s.
I wonder whether something is wrong with the op definitions in my code, so that it does not fully use the GPUs?
Here is the code:
import tensorflow as tf
import os
import time
def gpu_inference(features, scope):
    w = tf.get_variable("weights", shape=(4, 1))
    b = tf.get_variable("bias", shape=(1))
    return tf.matmul(features, w) + b

def gpu_losses(logits, labels, scope):  # all data tf.float32
    labels = tf.reshape(labels, shape=(6, 1))
    delta = tf.square(logits - labels)
    losses = tf.reduce_mean(delta)
    tf.add_to_collection("losses_coll", losses)  # no use??
    return losses
def average_gradients(gpu_grads):
    # this is just to compute the mean of the grads.
    average_grads = []
    for grad_of_gpus in zip(*gpu_grads):
        grads = []
        for g, _ in grad_of_gpus:  # (g, _) is (grad0_gpu0, var0_gpu0)...
            grads.append(g)
        grad = tf.reduce_mean(grads, 0)
        v = grad_of_gpus[0][1]
        grad_and_var = (grad, v)  # v is the variable, grad is the averaged gradient
        average_grads.append(grad_and_var)
    return average_grads
# define under cpu:
#  - variables (weights/bias/global_step/all_grads): w and b need to be set to "reuse".
#  - graph: read data into train_batch_queue()
#  - graph/functions: average_grads
# define under gpus:
#  - graph: read a batch from the queue, run inference and loss, compute each gpu's grads and collect them in a global list
with tf.device('/cpu:0'):
    csvFiles = os.listdir('./data')
    csvFiles = [i for i in csvFiles if i[-4:] == '.csv']
    csvFiles = ['./data/' + i for i in csvFiles]
    fileQ = tf.train.string_input_producer(csvFiles, shuffle=False)
    reader = tf.TextLineReader()
    key, value = reader.read(fileQ)
    record_defaults = [[0.0], [0.0], [0.0], [0.0], [0]]
    col1, col2, col3, col4, label = tf.decode_csv(value, record_defaults=record_defaults)
    feature = tf.stack([col1, col2, col3, col4])

    num_gpus = 2
    feature_batch, label_batch = tf.train.shuffle_batch([feature, label], batch_size=6, capacity=100, min_after_dequeue=1)
    # I think the queue should live on the CPU and the GPUs should dequeue batches from it. Is that right?
    train_batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue([feature_batch, label_batch], capacity=2 * num_gpus)

    max_step = 10000
    global_step_val = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    weights_val = tf.get_variable('weights', shape=(4, 1), dtype=tf.float32)
    bias_val = tf.get_variable('bias', shape=(1), dtype=tf.float32)
    # define variables and initialize them on the cpu:
    local_init = tf.local_variables_initializer()  # why is this needed??
    global_init = tf.global_variables_initializer()

    gpu_grads = []
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % ("gpu", i)) as scope:  # no need?
                tf.get_variable_scope().reuse_variables()
                x_batch, y_batch = train_batch_queue.dequeue()
                y_batch = tf.cast(y_batch, dtype=tf.float32)
                inf_batch = gpu_inference(x_batch, scope)
                loss = gpu_losses(inf_batch, y_batch, scope)
                optimizer = tf.train.GradientDescentOptimizer(0.01)
                grads = optimizer.compute_gradients(loss)
                gpu_grads.append(grads)
    # end with gpus

    avg_grads = average_gradients(gpu_grads)  # synchronization point across all towers.
    train_op = optimizer.apply_gradients(avg_grads, global_step=global_step_val)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))  # log_device_placement='/cpu:0' is wrong; it should be a bool
    sess.run([local_init, global_init])
    coord = tf.train.Coordinator()  # thread coordinator; the queue runners that read the dataset need it.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    start_time = time.time()
    for step in xrange(max_step):
        w = tf.get_variable("weights", shape=(4, 1))  # for test
        b = tf.get_variable("bias", shape=(1))  # for test
        _, loss_v, w, b, a_grads, g_step = sess.run([train_op, loss, w, b, avg_grads, global_step_val])
    duration = time.time() - start_time
    print("**duration is: ", duration)

    saver = tf.train.Saver()
    save_path = saver.save(sess, "./ex3.ckpt")
    print("**Model saved in file: %s" % save_path)

    coord.request_stop()
    coord.join(threads)
    print("**END**")
#end of with device(cpu)
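To see whether the ops really end up on the GPUs, this is the kind of check I have in mind. It is only a diagnostic sketch built on the graph above (the placement_config / check_sess names are just mine): log_device_placement=True is a bool that should make the runtime log the device it actually chooses for each op when the graph is placed on the first run call, while op.device only shows the device that was requested with tf.device.

# Diagnostic only: compare requested vs. actual op placement.
# Assumes the graph above has already been built in the default graph.
placement_config = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=True)  # a bool flag, not a device string
with tf.Session(config=placement_config) as check_sess:
    # The first run call triggers graph placement; the runtime then logs
    # the device chosen for the graph's ops. Running only the initializers
    # keeps this independent of the input queues.
    check_sess.run([tf.global_variables_initializer(),
                    tf.local_variables_initializer()])

# The device requested via tf.device for each tower op:
for op in tf.get_default_graph().get_operations():
    if op.name.startswith('gpu_'):  # ops created inside the "gpu_%d" name scopes above
        print(op.name, '->', op.device)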