We tried to implement the Tower method, but found that performance got worse:
Adapted from: https://github.com/tensorflow/models/tree/master/inception
Device:
Source code:
Splitting = False: default (single-GPU) version
Splitting = True: Tower (multi-GPU) version
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.client import device_lib
import tensorflow as tf
import numpy as np
import time

BATCH = 64        # batch size
DIM = 1000        # each Block maps a [BATCH*DIM, DIM] activation through a DIM x DIM weight
GPUs = 2          # number of towers / GPUs
Splitting = True  # False: default single-GPU version, True: multi-GPU Tower version
def init_matrix(shape):
    return tf.random_normal(shape, stddev=0.1)

def Block(param, x, name, reuse):
    # One fully connected sigmoid layer; its variables are collected into `param`
    # only the first time the layer is built (i.e. when reuse is not set).
    W = tf.get_variable('%sweight' % name, [DIM, DIM])
    b = tf.get_variable('%sbias' % name, [DIM])
    if not reuse:
        param.extend([W, b])
    x_ = tf.reshape(x, [-1, DIM])
    output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
    return tf.reshape(output, [-1, DIM, DIM])
def _tower_loss(param, inputs, reuse=None):
    # Stack of six Blocks; variables are shared across towers via `reuse`.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        output = Block(param, inputs, 'Layer.0.', reuse)
        output = Block(param, output, 'Layer.1.', reuse)
        output = Block(param, output, 'Layer.2.', reuse)
        output = Block(param, output, 'Layer.3.', reuse)
        output = Block(param, output, 'Layer.4.', reuse)
        output = Block(param, output, 'Layer.5.', reuse)
        output = tf.reshape(output, [-1, DIM * DIM])
        return tf.reduce_mean(output)
def _all_gradients(tower_grads):
    # Sum each variable's gradient over all towers
    # (same structure as _average_gradients in the inception example).
    all_grads = []
    for i in range(len(tower_grads[0])):
        grads = []
        for grad in tower_grads:
            expanded_g = tf.expand_dims(grad[i], 0)
            grads.append(expanded_g)
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_sum(grad, 0)
        all_grads.append(grad)
    return all_grads
if not Splitting:
    # Default version: build the whole graph without explicit device pinning.
    opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
    inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
    param = []
    loss = _tower_loss(param, inputs, None)
    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
    apply_gradient_op = opt.apply_gradients(zip(grad, param))
    merged = tf.summary.merge_all()
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
        session.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(".", session.graph)
        for i in range(100):
            start = time.time()
            session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
            print('Iter' + str(i) + ': time=' + str(time.time() - start))
else:
    # Tower version: keep the graph on the CPU, build one tower per GPU,
    # then combine the per-tower gradients.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
        inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
        inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)
        param = []
        tower_grads = []
        reuse = None
        for i in range(GPUs):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('Tower_%d' % i) as scope:
                    with tf.device('/cpu:0'):
                        loss = _tower_loss(param, inputs_splits[i], reuse)
                        reuse = True
                    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                    tower_grads.append(grad)
        grads = _all_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(zip(grads, param))
        merged = tf.summary.merge_all()
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
            session.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(".", session.graph)
            for i in range(100):
                start = time.time()
                session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                print('Iter' + str(i) + ': time=' + str(time.time() - start))
Performance:
Default version - uses only GPU:0
time = 0.867873907089
Tower version - tries to use multiple GPUs
time = 4.88468384743
Our questions are:
The Tower method turns out to be roughly 5x slower. Is there anything wrong with our implementation?
Following the tutorial, we keep the model variables on the CPU and split the work across the GPUs. However, our GPUs are connected to each other over PCIe rather than NVLink, so data transfer is often expensive. Is there another approach that works better for PCIe-based multi-GPU setups?
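To make our intent concrete: the placement we believe the tutorial prescribes is "variables on the CPU, compute on each tower's GPU". A minimal sketch of that pattern, assuming TF 1.x graph mode (assign_to_device is our own illustrative helper, not something from the tutorial):

def assign_to_device(compute_device, ps_device='/cpu:0'):
    # Send only variable-creating ops to the CPU; every other op stays on the GPU.
    variable_ops = ('Variable', 'VariableV2', 'VarHandleOp')
    def _device_fn(op):
        # tf.device() also accepts a function mapping each op to a device string.
        return ps_device if op.type in variable_ops else compute_device
    return _device_fn

# Hypothetical rewrite of our tower loop using the helper:
for i in range(GPUs):
    with tf.device(assign_to_device('/gpu:%d' % i)):
        with tf.name_scope('Tower_%d' % i):
            loss = _tower_loss(param, inputs_splits[i], reuse)
            reuse = True
            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
            tower_grads.append(grad)

This is only how we read the tutorial; whether it is the right placement for a PCIe-only machine is exactly what we are unsure about.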
Thanks.
Answer 0 (score: 0):
for i in range(GPUs):
    with tf.device('/gpu:%d' % i):
        with tf.name_scope('Tower_%d' % i) as scope:
            with tf.device('/cpu:0'):  ### this line may cause all ops to be placed on the CPU; try removing it
                loss = _tower_loss(param, inputs_splits[i], reuse)
                reuse = True
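For example, a rough, untested sketch of the loop with that inner device block removed, so each tower's ops actually land on its GPU:

for i in range(GPUs):
    with tf.device('/gpu:%d' % i):
        with tf.name_scope('Tower_%d' % i) as scope:
            # No inner tf.device('/cpu:0') here: if you want only the variables
            # on the CPU, pin just those (e.g. with a device function), not the
            # whole tower.
            loss = _tower_loss(param, inputs_splits[i], reuse)
            reuse = True
            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
            tower_grads.append(grad)

You can confirm where the ops ended up from the log_device_placement output you already enable.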