TensorFlow implementation is slower than the Torch one

Time: 2016-12-18 18:49:54

Tags: python tensorflow deep-learning torch

I am trying to implement the stacked hourglass network in TensorFlow; a Torch implementation already exists here.

I tested it with the default configuration (batch size = 6) on a Titan X Pascal, and the average training iteration takes about 343 ms.

I benchmarked my TensorFlow implementation with random inputs/outputs:

import tensorflow as tf

class stacked_hourglass():
    def __init__(self, nb_stack, name='stacked_hourglass'):
        self.nb_stack = nb_stack
        self.name = name

    def __call__(self, x):
        with tf.name_scope(self.name) as scope:
            padding = tf.pad(x, [[0,0],[3,3],[3,3],[0,0]], name='padding')
            with tf.name_scope("preprocessing") as sc:
                conv1 = self._conv(padding, 64, 7, 2, 'VALID', 'conv1')
                norm1 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5, 
                                    activation_fn=tf.nn.relu, scope=sc)
                r1 = self._residual_block(norm1, 128, 'r1')
                pool = tf.contrib.layers.max_pool2d(r1, [2,2], [2,2], 'VALID', scope=scope)
                r2 = self._residual_block(pool, 128, 'r2')
                r3 = self._residual_block(r2, 256, 'r3')
            hg = [None] * self.nb_stack
            ll = [None] * self.nb_stack
            ll_ = [None] * self.nb_stack
            out = [None] * self.nb_stack
            out_ = [None] * self.nb_stack
            sum_ = [None] * self.nb_stack
            with tf.name_scope('_hourglass_0_with_supervision') as sc:
                hg[0] = self._hourglass(r3, 4, 256, '_hourglass')
                ll[0] = self._conv_bn_relu(hg[0], 256, name='conv_1')
                ll_[0] = self._conv(ll[0],256,1,1,'VALID','ll')
                out[0] = self._conv(ll[0],16,1,1,'VALID','out')
                out_[0] = self._conv(out[0],256,1,1,'VALID','out_')
                sum_[0] = tf.add_n([ll_[0], out_[0], r3])
            for i in range(1, self.nb_stack - 1):
                with tf.name_scope('_hourglass_' + str(i) + '_with_supervision') as sc:
                    hg[i] = self._hourglass(sum_[i-1], 4, 256, '_hourglass')
                    ll[i] = self._conv_bn_relu(hg[i], 256, name='conv_1')
                    ll_[i] = self._conv(ll[i],256,1,1,'VALID','ll')
                    out[i] = self._conv(ll[i],16,1,1,'VALID','out')
                    out_[i] = self._conv(out[i],256,1,1,'VALID','out_')
                    sum_[i] = tf.add_n([ll_[i], out_[i], sum_[i-1]])
            with tf.name_scope('_hourglass_' + str(self.nb_stack - 1) + '_with_supervision') as sc:
                hg[self.nb_stack-1] = self._hourglass(sum_[self.nb_stack - 2], 4, 256, '_hourglass')
                ll[self.nb_stack-1] = self._conv_bn_relu(hg[self.nb_stack - 1], 256, name='conv_1')
                out[self.nb_stack-1] = self._conv(ll[self.nb_stack-1],16,1,1,'VALID','out')
            return tf.stack(out)

    def _conv(self, inputs, nb_filter, kernel_size=1, strides=1, pad='VALID', name='conv'):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)([kernel_size,\
                                    kernel_size,inputs.get_shape().as_list()[3],nb_filter]), name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1,strides,strides,1], padding=pad, data_format='NHWC')
            return conv

    def _conv_bn_relu(self, inputs, nb_filter, kernel_size=1, strides=1, name=None):
        with tf.name_scope(name) as scope:
            kernel = tf.Variable(tf.contrib.layers.xavier_initializer(uniform=False)([kernel_size,\
                                    kernel_size,inputs.get_shape().as_list()[3],nb_filter]), name='weights')
            conv = tf.nn.conv2d(inputs, kernel, [1,strides,strides,1], padding='SAME', data_format='NHWC')
            norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5, activation_fn=tf.nn.relu, scope=scope)
            return norm

    def _conv_block(self, inputs, nb_filter_out, name='_conv_block'):
        with tf.name_scope(name) as scope:
            with tf.name_scope('norm_conv1') as sc:
                norm1 = tf.contrib.layers.batch_norm(inputs, 0.9, epsilon=1e-5, 
                                    activation_fn=tf.nn.relu, scope=sc)
                conv1 = self._conv(norm1, nb_filter_out / 2, 1, 1, 'SAME', name='conv1')
            with tf.name_scope('norm_conv2') as sc:
                norm2 = tf.contrib.layers.batch_norm(conv1, 0.9, epsilon=1e-5, 
                                    activation_fn=tf.nn.relu, scope=sc)
                conv2 = self._conv(norm2, nb_filter_out / 2, 3, 1, 'SAME', name='conv2')
            with tf.name_scope('norm_conv3') as sc:
                norm3 = tf.contrib.layers.batch_norm(conv2, 0.9, epsilon=1e-5, 
                                    activation_fn=tf.nn.relu, scope=sc)
                conv3 = self._conv(norm3, nb_filter_out, 1, 1, 'SAME', name='conv3')
            return conv3

    def _skip_layer(self, inputs, nb_filter_out, name='_skip_layer'):
        if inputs.get_shape()[3].__eq__(tf.Dimension(nb_filter_out)):
            return inputs
        else:
            with tf.name_scope(name) as scope:
                conv = self._conv(inputs, nb_filter_out, 1, 1, 'SAME', name='conv')
                return conv

    def _residual_block(self, inputs, nb_filter_out, name='_residual_block'):
        with tf.name_scope(name) as scope:
            _conv_block = self._conv_block(inputs, nb_filter_out)
            _skip_layer = self._skip_layer(inputs, nb_filter_out)
            return tf.add(_skip_layer, _conv_block)

    def _hourglass(self, inputs, n, nb_filter_res, name='_hourglass'):
        with tf.name_scope(name) as scope:
            # Upper branch
            up1 = self._residual_block(inputs, nb_filter_res, 'up1')
            # Lower branch
            pool = tf.contrib.layers.max_pool2d(inputs, [2,2], [2,2], 'VALID', scope=scope)
            low1 = self._residual_block(pool, nb_filter_res, 'low1')
            if n > 1:
                low2 = self._hourglass(low1, n-1, nb_filter_res, 'low2')
            else:
                low2 = self._residual_block(low1, nb_filter_res, 'low2')
            low3 = self._residual_block(low2, nb_filter_res, 'low3')
            low4 = tf.image.resize_nearest_neighbor(low3, tf.shape(low3)[1:3] * 2,
                                                    name='upsampling')
            if n < 4:
                return tf.add(up1, low4, name='merge')
            else:
                return self._residual_block(tf.add(up1, low4), nb_filter_res, 'low4')

if __name__ == "__main__":
    import os
    import sys
    import numpy as np
    import time
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) 
    with tf.Graph().as_default():
        DEVICE = '/gpu:0'
        with tf.device(DEVICE):
            print "start build model..."
            _x = tf.placeholder(tf.float32, [None, 256, 256, 3])
            y = tf.placeholder(tf.float32, [8, None, 64, 64, 16])
            output = stacked_hourglass(8, 'stacked_hourglass')(_x)
            loss = tf.reduce_mean(tf.square(output - y))
            rmsprop = tf.train.RMSPropOptimizer(2.5e-4)
            print "build finished..."
        train_step = tf.Variable(0, name='global_step', trainable=False)
        with tf.device(DEVICE):
            train_rmsprop = rmsprop.minimize(loss, train_step)
        init = tf.global_variables_initializer()
        with tf.Session() as sess:
            with tf.device(DEVICE):
                sess.run(init)
            print "test..."
            xarr = np.random.rand(100, 6, 256, 256, 3)
            yarr = np.random.rand(100, 8, 6, 64, 64, 16)
            _time = time.clock()
            with tf.device(DEVICE):
                for u in range(0, 100):
                    sess.run(train_rmsprop, feed_dict={_x:xarr[u], y:yarr[u]})
            print "test:", time.clock() - _time

The output is:

I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcublas.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcudnn.so.5 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcufft.so.8.0 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcuda.so.1 locally
I tensorflow/stream_executor/dso_loader.cc:128] successfully opened CUDA library libcurand.so.8.0 locally
start build model...
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "NegTrain" device_type: "CPU"') for unknown op: NegTrain
E tensorflow/core/framework/op_kernel.cc:925] OpKernel ('op: "Skipgram" device_type: "CPU"') for unknown op: Skipgram
build finished...
I tensorflow/core/common_runtime/gpu/gpu_device.cc:885] Found device 0 with properties: 
name: TITAN X (Pascal)
major: 6 minor: 1 memoryClockRate (GHz) 1.531
pciBusID 0000:05:00.0
Total memory: 11.90GiB
Free memory: 11.75GiB
I tensorflow/core/common_runtime/gpu/gpu_device.cc:906] DMA: 0 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:916] 0:   Y 
I tensorflow/core/common_runtime/gpu/gpu_device.cc:975] Creating TensorFlow device (/gpu:0) -> (device: 0, name: TITAN X (Pascal), pci bus id: 0000:05:00.0)
test...
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=2609 evicted_count=1000 eviction_rate=0.383289 and unsatisfied allocation rate=0.667841
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 100 to 110
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2013 evicted_count=2000 eviction_rate=0.993542 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=4719 evicted_count=3000 eviction_rate=0.635728 and unsatisfied allocation rate=0.625358
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 193 to 212
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=2025 evicted_count=2000 eviction_rate=0.987654 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1037 evicted_count=1000 eviction_rate=0.96432 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1054 evicted_count=1000 eviction_rate=0.948767 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 0 get requests, put_count=1079 evicted_count=1000 eviction_rate=0.926784 and unsatisfied allocation rate=0
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:247] PoolAllocator: After 4543 get requests, put_count=5036 evicted_count=2000 eviction_rate=0.397141 and unsatisfied allocation rate=0.359674
I tensorflow/core/common_runtime/gpu/pool_allocator.cc:259] Raising pool_size_limit_ from 1400 to 1540
test: 71.733044

which means an average iteration of about 717 ms, twice as slow as the Torch implementation...

I know TensorFlow is expected to be somewhat slower, but a lot of work has been done to catch up (and some benchmarks are now supposedly very close).

Do you know what is making my implementation this slow?

1 answer:

Answer 0 (score: 2):

How do the forward-step times compare? TensorFlow has historically been slower than Torch on backprop because its automatic differentiation runs on a finer-grained graph (individual math operations rather than Torch layers), so more ops are generated for the backward pass. In some cases this has been mitigated by adding fused versions of important ops and of their gradients.
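
To separate the two, you could time a forward-only run against the full training step; a minimal sketch reusing the names from the question's script (output, train_rmsprop, _x, y, xarr, yarr), to be run inside the same session:

    # Rough sketch: compare forward-only time to the full train step,
    # reusing the session and tensors from the question's script.
    import time

    sess.run(train_rmsprop, feed_dict={_x: xarr[0], y: yarr[0]})  # warm-up step

    t0 = time.time()
    for u in range(100):
        sess.run(output, feed_dict={_x: xarr[u]})                 # forward only
    print "forward only (s/iter):", (time.time() - t0) / 100

    t0 = time.time()
    for u in range(100):
        sess.run(train_rmsprop, feed_dict={_x: xarr[u], y: yarr[u]})  # forward + backward + update
    print "train step   (s/iter):", (time.time() - t0) / 100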

Some ideas:

  1. Make sure you are using tf.fused_batch_norm under the hood (i.e., the fused=True argument here); see the first sketch after this list.

  2. Use queues instead of feed_dict. feed_dict incurs an extra copy from the Python runtime into the TensorFlow runtime, so you are actually doing 2 copies: Python -> TensorFlow CPU, then TensorFlow CPU -> TensorFlow GPU. For the extra step of hiding the CPU -> GPU transfer latency there is this; see the second sketch after this list.

  3. Looking at timelines can tell you which part is too slow; see the third sketch after this list.

  4. tcmalloc and C protobufs:

    sudo apt-get install google-perftools
    export LD_PRELOAD="/usr/lib/libtcmalloc.so.4" 
    pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/protobuf-3.0.0-cp27-none-linux_x86_64.whl
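
For item 1, a minimal sketch of what the batch-norm call in the question's _conv_bn_relu could look like, assuming the tf.contrib.layers.batch_norm in your TensorFlow build already accepts the fused argument (check your version's docs):

    # Sketch for item 1 (assumes this TF build exposes `fused` on
    # tf.contrib.layers.batch_norm): request the fused batch-norm kernel.
    norm = tf.contrib.layers.batch_norm(conv, 0.9, epsilon=1e-5,
                                        activation_fn=tf.nn.relu,
                                        fused=True, scope=scope)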
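
For item 2, a rough sketch of feeding through a tf.FIFOQueue so the Python -> TensorFlow copy can happen in a background thread while the GPU trains; shapes are taken from the question's placeholders and the capacity of 4 is an arbitrary choice:

    # Sketch for item 2: build the model on dequeued tensors instead of
    # placeholders, and fill the queue from a separate Python thread.
    x_in = tf.placeholder(tf.float32, [6, 256, 256, 3])
    y_in = tf.placeholder(tf.float32, [8, 6, 64, 64, 16])
    queue = tf.FIFOQueue(capacity=4,
                         dtypes=[tf.float32, tf.float32],
                         shapes=[[6, 256, 256, 3], [8, 6, 64, 64, 16]])
    enqueue_op = queue.enqueue([x_in, y_in])
    x_batch, y_batch = queue.dequeue()

    output = stacked_hourglass(8, 'stacked_hourglass')(x_batch)
    loss = tf.reduce_mean(tf.square(output - y_batch))

    # Producer thread (still uses feed_dict, but overlaps with training):
    #     sess.run(enqueue_op, feed_dict={x_in: xarr[u], y_in: yarr[u]})
    # The training thread then runs the train op with no feed_dict at all.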
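
For item 3, a sketch of tracing a single step with tensorflow.python.client.timeline and dumping a Chrome trace (the output filename is arbitrary):

    # Sketch for item 3: trace one training step and write a timeline that
    # can be opened in chrome://tracing to see per-op durations.
    from tensorflow.python.client import timeline

    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    sess.run(train_rmsprop, feed_dict={_x: xarr[0], y: yarr[0]},
             options=run_options, run_metadata=run_metadata)

    tl = timeline.Timeline(run_metadata.step_stats)
    with open('timeline.json', 'w') as f:
        f.write(tl.generate_chrome_trace_format())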