We tried to implement the Tower method, but found that performance got worse:
Adapted from: https://github.com/tensorflow/models/tree/master/inception
Device:
Source code:
Splitting = False: default (single-GPU) version
Splitting = True: Tower (multi-GPU) version
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.client import device_lib
import tensorflow as tf
import numpy as np
import time

BATCH = 64        # batch size
DIM = 1000        # each Block maps a [BATCH*DIM, DIM] activation through a DIM x DIM weight
GPUs = 2          # number of towers / GPUs
Splitting = True  # False: default single-GPU version, True: multi-GPU Tower version
def init_matrix(shape):
    return tf.random_normal(shape, stddev=0.1)

def Block(param, x, name, reuse):
    # One fully connected sigmoid layer; its variables are collected into `param`
    # only the first time the layer is built (i.e. when reuse is not set).
    W = tf.get_variable('%sweight' % name, [DIM, DIM])
    b = tf.get_variable('%sbias' % name, [DIM])
    if not reuse:
        param.extend([W, b])
    x_ = tf.reshape(x, [-1, DIM])
    output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
    return tf.reshape(output, [-1, DIM, DIM])
def _tower_loss(param, inputs, reuse=None):
    # Stack of six Blocks; variables are shared across towers via `reuse`.
    with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        output = Block(param, inputs, 'Layer.0.', reuse)
        output = Block(param, output, 'Layer.1.', reuse)
        output = Block(param, output, 'Layer.2.', reuse)
        output = Block(param, output, 'Layer.3.', reuse)
        output = Block(param, output, 'Layer.4.', reuse)
        output = Block(param, output, 'Layer.5.', reuse)
        output = tf.reshape(output, [-1, DIM * DIM])
        return tf.reduce_mean(output)
def _all_gradients(tower_grads):
    # Sum each variable's gradient over all towers
    # (same structure as _average_gradients in the inception example).
    all_grads = []
    for i in range(len(tower_grads[0])):
        grads = []
        for grad in tower_grads:
            expanded_g = tf.expand_dims(grad[i], 0)
            grads.append(expanded_g)
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_sum(grad, 0)
        all_grads.append(grad)
    return all_grads
if not Splitting:
    # Default version: build the whole graph without explicit device pinning.
    opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
    inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
    param = []
    loss = _tower_loss(param, inputs, None)
    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
    apply_gradient_op = opt.apply_gradients(zip(grad, param))
    merged = tf.summary.merge_all()
    with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
        session.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(".", session.graph)
        for i in range(100):
            start = time.time()
            session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
            print('Iter' + str(i) + ': time=' + str(time.time() - start))
else:
    # Tower version: keep the graph on the CPU, build one tower per GPU,
    # then combine the per-tower gradients.
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
        inputs = tf.placeholder(tf.float32, shape=[BATCH, DIM, DIM])
        inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)
        param = []
        tower_grads = []
        reuse = None
        for i in range(GPUs):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('Tower_%d' % i) as scope:
                    with tf.device('/cpu:0'):
                        loss = _tower_loss(param, inputs_splits[i], reuse)
                        reuse = True
                    grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
                    tower_grads.append(grad)
        grads = _all_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(zip(grads, param))
        merged = tf.summary.merge_all()
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
            session.run(tf.global_variables_initializer())
            writer = tf.summary.FileWriter(".", session.graph)
            for i in range(100):
                start = time.time()
                session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
                print('Iter' + str(i) + ': time=' + str(time.time() - start))
Performance:
Default version - uses only GPU:0
time = 0.867873907089
Tower version - tries to use multiple GPUs
time = 4.88468384743
Our questions are:
The Tower method turns out to be roughly 5x slower. Is there anything wrong with our implementation?
Following the tutorial, we keep the model variables on the CPU and split the work across the GPUs. However, our GPUs are connected to each other over PCIe rather than NVLink, so data transfer is often expensive. Is there another approach that works better for PCIe-based multi-GPU setups?
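To make our intent concrete: the placement we believe the tutorial prescribes is "variables on the CPU, compute on each tower's GPU". A minimal sketch of that pattern, assuming TF 1.x graph mode (assign_to_device is our own illustrative helper, not something from the tutorial):

def assign_to_device(compute_device, ps_device='/cpu:0'):
    # Send only variable-creating ops to the CPU; every other op stays on the GPU.
    variable_ops = ('Variable', 'VariableV2', 'VarHandleOp')
    def _device_fn(op):
        # tf.device() also accepts a function mapping each op to a device string.
        return ps_device if op.type in variable_ops else compute_device
    return _device_fn

# Hypothetical rewrite of our tower loop using the helper:
for i in range(GPUs):
    with tf.device(assign_to_device('/gpu:%d' % i)):
        with tf.name_scope('Tower_%d' % i):
            loss = _tower_loss(param, inputs_splits[i], reuse)
            reuse = True
            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
            tower_grads.append(grad)

This is only how we read the tutorial; whether it is the right placement for a PCIe-only machine is exactly what we are unsure about.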
Thanks.
Answer 0 (score: 0):
for i in range(GPUs):
    with tf.device('/gpu:%d' % i):
        with tf.name_scope('Tower_%d' % i) as scope:
            with tf.device('/cpu:0'):  ### this line may cause all ops to be placed on the CPU; try removing it
                loss = _tower_loss(param, inputs_splits[i], reuse)
                reuse = True
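For example, a rough, untested sketch of the loop with that inner device block removed, so each tower's ops actually land on its GPU:

for i in range(GPUs):
    with tf.device('/gpu:%d' % i):
        with tf.name_scope('Tower_%d' % i) as scope:
            # No inner tf.device('/cpu:0') here: if you want only the variables
            # on the CPU, pin just those (e.g. with a device function), not the
            # whole tower.
            loss = _tower_loss(param, inputs_splits[i], reuse)
            reuse = True
            grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
            tower_grads.append(grad)

You can confirm where the ops ended up from the log_device_placement output you already enable.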