Poor multi-GPU performance in TensorFlow

Posted: 2017-07-18 02:34:14

Tags: tensorflow

We tried to implement the tower method, but found that performance got worse:

  1. Adapted from: https://github.com/tensorflow/models/tree/master/inception

  2. Hardware:

    • Intel Core i7
    • GTX-1060 x 2
  3. Source code:

    • Splitting = False: the default single-GPU version

    • Splitting = True: the tower version

  4. import tensorflow as tf
    import numpy as np
    import time
    BATCH = 64   # total batch size; the tower version splits it across the GPUs
    DIM = 1000
    GPUs = 2
    
    Splitting = True  # set to False to run the single-GPU default version
    
    def init_matrix(shape):
      return tf.random_normal(shape, stddev=0.1)
    
    # One fully connected sigmoid layer; weights/biases are created once and reused across towers.
    def Block(param, x, name, reuse):
      W  = tf.get_variable('%sweight'%name, [DIM, DIM])
      b  = tf.get_variable('%sbias'%name, [DIM])
      if not reuse: param.extend([W, b])
    
      x_ = tf.reshape(x, [-1,DIM])
      output = tf.nn.sigmoid(tf.matmul(x_, W) + b)
      return tf.reshape(output,[-1,DIM,DIM])
    
    # Builds one tower: six stacked Blocks, reduced to a scalar loss.
    def _tower_loss(param, inputs, reuse=None):
      with tf.variable_scope(tf.get_variable_scope(), reuse=reuse):
        output = Block(param, inputs, 'Layer.0.', reuse)
        output = Block(param, output, 'Layer.1.', reuse)
        output = Block(param, output, 'Layer.2.', reuse)
        output = Block(param, output, 'Layer.3.', reuse)
        output = Block(param, output, 'Layer.4.', reuse)
        output = Block(param, output, 'Layer.5.', reuse)
        output = tf.reshape(output, [-1, DIM*DIM])
        return tf.reduce_mean(output)
    
    # Sums each variable's gradient across all towers.
    def _all_gradients(tower_grads):
      all_grads = []
      for i in range(len(tower_grads[0])):
        # Gather the i-th gradient from every tower, then add them up.
        grads = [tf.expand_dims(g[i], 0) for g in tower_grads]
        grad = tf.reduce_sum(tf.concat(axis=0, values=grads), 0)
        all_grads.append(grad)
      return all_grads
    
    # Default version: build and train the whole model on a single device.
    if not Splitting:
      opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
      inputs = tf.placeholder(tf.float32, shape=[BATCH,DIM,DIM])
    
      param = []
      loss = _tower_loss(param, inputs, None)
      grad, _  = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
      apply_gradient_op = opt.apply_gradients(zip(grad, param))
      merged = tf.summary.merge_all()
    
      with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
        session.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(".", session.graph)
      
        for i in range(100):
          start = time.time()
          session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
          print('Iter' + str(i) + ': time=' + str(time.time() - start))
    
    else:
      # Tower version: split the batch across the GPUs and sum the per-tower gradients on the CPU.
      with tf.Graph().as_default(), tf.device('/cpu:0'):
        opt = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9)
     
        inputs = tf.placeholder(tf.float32, shape=[BATCH,DIM,DIM])
        inputs_splits = tf.split(axis=0, num_or_size_splits=GPUs, value=inputs)
    
        param = []
        tower_grads = []
        reuse = None
        for i in range(GPUs):
          with tf.device('/gpu:%d'%i):
            with tf.name_scope('Tower_%d'%i) as scope:
              with tf.device('/cpu:0'):
                loss = _tower_loss(param, inputs_splits[i], reuse)
              reuse = True
              grad, _  = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
              tower_grads.append(grad)
        grads = _all_gradients(tower_grads)
        apply_gradient_op = opt.apply_gradients(zip(grads, param))
        merged = tf.summary.merge_all()
    
        with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
          session.run(tf.global_variables_initializer())
          writer = tf.summary.FileWriter(".", session.graph)
          for i in range(100):
            start = time.time()
            session.run(apply_gradient_op, feed_dict={inputs: np.zeros([BATCH, DIM, DIM])})
            print('Iter' + str(i) + ': time=' + str(time.time() - start))

  5. Performance (time per iteration, in seconds):

    • Default version - uses only GPU:0

      time = 0.867873907089

    • Tower version - tries to use both GPUs

      time = 4.88468384743

  6. Our questions:

    1. The tower version turns out to be about 5x slower. Is there anything wrong with our implementation?

    2. Following the tutorial, we keep the model variables on the CPU and split the work across the GPUs (see the sketch after this list). However, our GPUs are connected over PCIe rather than NVLink, and data transfer between them costs a lot. Is there any other approach that can help a PCIe-based multi-GPU setup?

    3. Thanks.
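
For clarity, the variables-on-CPU pattern described in question 2 amounts to roughly the following. This is a minimal, self-contained sketch rather than the actual inception code; the sizes and the x_splits inputs are stand-ins for the script above:

import tensorflow as tf

DIM, GPUs = 1000, 2
# Hypothetical per-tower input slices; the real script splits a placeholder instead.
x_splits = [tf.zeros([32, DIM]) for _ in range(GPUs)]

# "Variables on CPU, compute on GPU": on every step, each tower pulls a
# fresh copy of W from host memory over PCIe before it can run its matmul.
with tf.device('/cpu:0'):
  W = tf.get_variable('weight', [DIM, DIM])

ys = []
for i in range(GPUs):
  with tf.device('/gpu:%d' % i):
    ys.append(tf.matmul(x_splits[i], W))  # implicit CPU -> GPU copy of W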

1 answer:

Answer 0 (score: 0)

for i in range(GPUs):
  with tf.device('/gpu:%d'%i):
    with tf.name_scope('Tower_%d'%i) as scope:
      with tf.device('/cpu:0'):  ### this line may cause every op to be placed on the CPU; try removing it
        loss = _tower_loss(param, inputs_splits[i], reuse)
      reuse = True
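
With that line removed, a minimal sketch of the corrected tower loop (assuming the rest of the question's script is unchanged) would look like this:

for i in range(GPUs):
  with tf.device('/gpu:%d' % i):
    with tf.name_scope('Tower_%d' % i) as scope:
      # Forward and backward ops are now placed on GPU i; the variables
      # created by the first tower are shared with later towers via reuse.
      loss = _tower_loss(param, inputs_splits[i], reuse)
      reuse = True
      grad, _ = tf.clip_by_global_norm(tf.gradients(loss, param), 5.0)
      tower_grads.append(grad)

Since the script already sets log_device_placement=True, the session's placement log can be used to check that the ops under Tower_0 and Tower_1 actually land on /gpu:0 and /gpu:1 after the change.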