I wrote a multi-GPU version of a very simple training job. However, running it on 1 CPU + 2 GPUs takes longer than running it on a single CPU, even when I set max_step to a very large value such as 1000000.
The performance looks like this: the 1 CPU + 2 GPU run takes about 27 s, while the single-CPU run takes about 20 s.
I wonder whether something is wrong with the op definitions in my code, so that it does not fully use the GPUs?
Here is the code:
import tensorflow as tf
import os
import time
def gpu_inference(features, scope):
    w = tf.get_variable("weights", shape=(4, 1))
    b = tf.get_variable("bias", shape=(1))
    return tf.matmul(features, w) + b

def gpu_losses(logits, labels, scope):  # all data tf.float32
    labels = tf.reshape(labels, shape=(6, 1))
    delta = tf.square(logits - labels)
    losses = tf.reduce_mean(delta)
    tf.add_to_collection("losses_coll", losses)  # no use??
    return losses
def average_gradients(gpu_grads):
    # this is just to compute the mean of the grads.
    average_grads = []
    for grad_of_gpus in zip(*gpu_grads):
        grads = []
        for g, _ in grad_of_gpus:  # (g, _) is (grad0_gpu0, var0_gpu0)...
            grads.append(g)
        grad = tf.reduce_mean(grads, 0)
        v = grad_of_gpus[0][1]
        grad_and_var = (grad, v)  # v is the variable, grad is the averaged gradient
        average_grads.append(grad_and_var)
    return average_grads
# define under cpu:
#  - variables (weights/bias/global_step/all_grads): w and b need to be set to "reuse".
#  - graph: read data into train_batch_queue()
#  - graph/functions: average_grads
# define under gpus:
#  - graph: read a batch from the queue, run inference and loss, compute each gpu's grads and collect them in a global list
with tf.device('/cpu:0'):
    csvFiles = os.listdir('./data')
    csvFiles = [i for i in csvFiles if i[-4:] == '.csv']
    csvFiles = ['./data/' + i for i in csvFiles]
    fileQ = tf.train.string_input_producer(csvFiles, shuffle=False)
    reader = tf.TextLineReader()
    key, value = reader.read(fileQ)
    record_defaults = [[0.0], [0.0], [0.0], [0.0], [0]]
    col1, col2, col3, col4, label = tf.decode_csv(value, record_defaults=record_defaults)
    feature = tf.stack([col1, col2, col3, col4])

    num_gpus = 2
    feature_batch, label_batch = tf.train.shuffle_batch([feature, label], batch_size=6, capacity=100, min_after_dequeue=1)
    # I think the queue should live on the CPU and the GPUs should dequeue batches from it. Is that right?
    train_batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue([feature_batch, label_batch], capacity=2 * num_gpus)

    max_step = 10000
    global_step_val = tf.get_variable('global_step', [], initializer=tf.constant_initializer(0), trainable=False)
    weights_val = tf.get_variable('weights', shape=(4, 1), dtype=tf.float32)
    bias_val = tf.get_variable('bias', shape=(1), dtype=tf.float32)
    # define variables and initialize them on the cpu:
    local_init = tf.local_variables_initializer()  # why is this needed??
    global_init = tf.global_variables_initializer()

    gpu_grads = []
    for i in range(num_gpus):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % ("gpu", i)) as scope:  # no need?
                tf.get_variable_scope().reuse_variables()
                x_batch, y_batch = train_batch_queue.dequeue()
                y_batch = tf.cast(y_batch, dtype=tf.float32)
                inf_batch = gpu_inference(x_batch, scope)
                loss = gpu_losses(inf_batch, y_batch, scope)
                optimizer = tf.train.GradientDescentOptimizer(0.01)
                grads = optimizer.compute_gradients(loss)
                gpu_grads.append(grads)
    # end with gpus

    avg_grads = average_gradients(gpu_grads)  # synchronization point across all towers.
    train_op = optimizer.apply_gradients(avg_grads, global_step=global_step_val)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))  # log_device_placement='/cpu:0' is wrong; it should be a bool
    sess.run([local_init, global_init])
    coord = tf.train.Coordinator()  # thread coordinator; the queue runners that read the dataset need it.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    start_time = time.time()
    for step in xrange(max_step):
        w = tf.get_variable("weights", shape=(4, 1))  # for test
        b = tf.get_variable("bias", shape=(1))  # for test
        _, loss_v, w, b, a_grads, g_step = sess.run([train_op, loss, w, b, avg_grads, global_step_val])
    duration = time.time() - start_time
    print("**duration is: ", duration)

    saver = tf.train.Saver()
    save_path = saver.save(sess, "./ex3.ckpt")
    print("**Model saved in file: %s" % save_path)

    coord.request_stop()
    coord.join(threads)
    print("**END**")
#end of with device(cpu)
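To see whether the ops really end up on the GPUs, this is the kind of check I have in mind. It is only a diagnostic sketch built on the graph above (the placement_config / check_sess names are just mine): log_device_placement=True is a bool that should make the runtime log the device it actually chooses for each op when the graph is placed on the first run call, while op.device only shows the device that was requested with tf.device.

# Diagnostic only: compare requested vs. actual op placement.
# Assumes the graph above has already been built in the default graph.
placement_config = tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=True)  # a bool flag, not a device string
with tf.Session(config=placement_config) as check_sess:
    # The first run call triggers graph placement; the runtime then logs
    # the device chosen for the graph's ops. Running only the initializers
    # keeps this independent of the input queues.
    check_sess.run([tf.global_variables_initializer(),
                    tf.local_variables_initializer()])

# The device requested via tf.device for each tower op:
for op in tf.get_default_graph().get_operations():
    if op.name.startswith('gpu_'):  # ops created inside the "gpu_%d" name scopes above
        print(op.name, '->', op.device)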