TensorFlow 前向传递和后向传递之间的大暂停

时间:2017-03-30 05:52:34

标签: python tensorflow deep-learning

要分析CNN模型,我正在使用CUPTI分析工具,如下所示:

# Environment setup: pin the process to one GPU *before* TensorFlow is
# imported, so device enumeration follows PCI bus order.
import os
# Fixed: the original split `os.environ` and the subscript assignment across
# two lines, leaving `['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'` — an assignment
# to a list literal, which is a SyntaxError.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import numpy as np
import tensorflow as tf
from scipy import misc
import time
from tensorflow.python.client import timeline
from tensorflow.contrib import layers
from tensorflow.contrib.framework import arg_scope, arg_scoped_arguments
DATA_FORMAT = 'NCHW'        # channels-first layout for conv/pool ops below
TRAIN_BATCH_SIZE = 64
TLU_THRESHOLD = 3.          # clip bound used by the 'TLU' op in the model
# Placeholder for the 30 fixed 5x5 single-channel SRM filter kernels
# (here randomized; shape matches the conv2d filter [H, W, in, out]).
SRM_Kernels = np.random.rand(5, 5, 1, 30).astype('float32')
class Model():
    """TF-1.x graph builder for a small CNN (SRM preprocessing + conv stack).

    Graph construction is split into three explicit steps, each of which
    stores its outputs on the instance:
      _build_model     -> self.outputs (logits, 2 classes)
      _build_losses    -> self.loss, self.accuracy
      _build_summaries -> a scalar loss summary

    Every layer tensor is also appended to ``self.L`` so intermediate
    activations can be inspected/profiled.
    """

    def _build_model(self, inputs, is_training, data_format, name='HolyGraal'):
        """Build the forward graph and return the 2-class logits.

        Args:
            inputs: image batch placeholder, NHWC layout (transposed to
                NCHW below when ``data_format == 'NCHW'``).
            is_training: NOTE(review) — accepted but never used in this
                body; presumably intended for batch-norm/dropout switches.
            data_format: 'NCHW' or anything else (treated as NHWC).
            name: stored on the instance; not used as a graph scope here.

        Returns:
            The final layer tensor (also stored as ``self.outputs``).
        """
        self.name = name
        self.inputs = inputs
        if data_format == 'NCHW':
            # Convert NHWC input to channels-first for the NCHW ops below.
            _inputs = tf.transpose(inputs, [0, 3, 1, 2])
        else:
            _inputs = inputs
        # Running list of layer outputs; each layer consumes self.L[-1].
        self.L = []
        # Defaults for all avg_pool2d calls unless overridden per-call.
        with arg_scope([layers.avg_pool2d], \
                kernel_size=[3,3], stride=[2,2], padding='VALID', \
                data_format=data_format):
            with tf.variable_scope('SRM_preprocess'):
                # Fixed-size filter bank initialized from SRM_Kernels
                # (trainable here, with L2 weight decay).
                W_SRM = tf.get_variable('W', initializer=SRM_Kernels, \
                            dtype=tf.float32, \
                            regularizer=layers.l2_regularizer(5e-4))
                b = tf.get_variable('b', shape=[30], dtype=tf.float32, \
                            initializer=tf.constant_initializer(0.2))
                # Layer1: cast (uint8 input) -> conv with SRM bank -> bias.
                self.L.append(tf.nn.bias_add( \
                        tf.nn.conv2d(tf.cast(_inputs, tf.float32), \
                        W_SRM, [1,1,1,1], 'VALID', \
                        data_format=data_format), b, \
                        data_format=data_format, name='Layer1'))
                # Truncated linear unit: clip activations to +/- threshold.
                self.L.append(tf.clip_by_value(self.L[-1], \
                              -TLU_THRESHOLD, TLU_THRESHOLD, name='TLU'))
            with tf.variable_scope('ConvNetwork'):
                # Shared defaults for every conv layer in this scope.
                with arg_scope([layers.conv2d], num_outputs=30, \
                        kernel_size=3, stride=1, padding='VALID', \
                        data_format=data_format, activation_fn=tf.nn.relu, \
                        weights_initializer=layers.xavier_initializer_conv2d(), \
                        weights_regularizer=layers.l2_regularizer(5e-4), \
                        biases_initializer=tf.constant_initializer(0.2), \
                        biases_regularizer=None):
                    self.L.append(layers.conv2d(self.L[-1], \
                                  scope='Layer2'))
                    self.L.append(layers.conv2d(self.L[-1], \
                                  scope='Layer3'))
                    self.L.append(layers.conv2d(self.L[-1], \
                                  scope='Layer4'))
                    # First downsampling (2x2 pool overrides the 3x3 default).
                    self.L.append(layers.avg_pool2d(self.L[-1], \
                                  kernel_size=[2,2], scope='Stride1'))
                    # Wider 5x5 convs with 32 output channels, interleaved
                    # with the default 3x3/stride-2 average pools.
                    with arg_scope([layers.conv2d], kernel_size=5, \
                                   num_outputs=32):
                        self.L.append(layers.conv2d(self.L[-1], \
                                      scope='Layer5'))
                        self.L.append(layers.avg_pool2d(self.L[-1], \
                                      scope='Stride2'))
                        self.L.append(layers.conv2d(self.L[-1], \
                                      scope='Layer6'))
                        self.L.append(layers.avg_pool2d(self.L[-1], \
                                      scope='Stride3'))
                        self.L.append(layers.conv2d(self.L[-1], \
                                      scope='Layer7'))
                    self.L.append(layers.avg_pool2d(self.L[-1], \
                                  scope='Stride4'))
                    # Final reduction layers: 16 channels, last one strided.
                    self.L.append(layers.conv2d(self.L[-1], \
                                  num_outputs=16, scope='Layer8'))
                    self.L.append(layers.conv2d(self.L[-1], \
                                  num_outputs=16, stride=3, scope='Layer9'))
                # Classifier head: flatten -> linear (no activation) logits.
                self.L.append(layers.flatten(self.L[-1]))
                self.L.append(layers.fully_connected(self.L[-1], num_outputs=2, \
                        activation_fn=None, normalizer_fn=None, \
                        weights_initializer=tf.random_normal_initializer(mean=0., stddev=0.01), \
                        biases_initializer=tf.constant_initializer(0.), scope='ip'))
        self.outputs = self.L[-1]
        return self.outputs

    def _build_losses(self, labels):
        """Attach softmax cross-entropy loss and accuracy to the graph.

        Args:
            labels: int64 class-index tensor of shape [batch] (one-hot
                encoded here to 2 classes).

        Returns:
            (self.loss, self.accuracy) scalar tensors.
        """
        with tf.variable_scope('loss'):
            oh = tf.one_hot(labels, 2)
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( \
                                                    labels=oh, \
                                                    logits=self.outputs))
        with tf.variable_scope('accuracy'):
            # argmax over logits == predicted class; compare to labels.
            am = tf.argmax(self.outputs, 1)
            equal = tf.equal(am, labels)
            self.accuracy = tf.reduce_mean(tf.cast(equal, tf.float32))
        return self.loss, self.accuracy

    def _build_summaries(self):
        """Create and return a scalar summary op for the training loss."""
        loss_summary = tf.summary.scalar('train_loss', self.loss)
        return loss_summary

# --- Graph construction (module level) ---------------------------------
# Non-trainable bool variable; passed to _build_model (which currently
# ignores it — see NOTE there).
is_training = tf.get_variable('is_training', dtype=tf.bool, shape=[], \
                          initializer=tf.constant_initializer(True), \
                          trainable=False)
# Feed inputs: 256x256 grayscale uint8 images, variable batch size.
img_batch = tf.placeholder(shape=[None, 256, 256, 1], dtype='uint8')
label_batch = tf.placeholder(shape=[None], dtype='uint8')
# Synthetic data: 21 batches of 64 images / binary labels.
img_val = np.random.randint(0,256,(21,64,256,256,1))
label_val = np.random.randint(0,2,(21,64))
model = Model()
outputs = model._build_model(img_batch, is_training, DATA_FORMAT)
# Labels are cast to int64 to match tf.argmax's output in the accuracy op.
loss, accuracy = model._build_losses(tf.cast(label_batch, tf.int64))

# Optimizer + training op (variable named 'sgd' but is actually Adam).
sgd = tf.train.AdamOptimizer()
global_step = tf.get_variable('global_step', dtype=tf.int32, shape=[], \
                              initializer=tf.constant_initializer(0))
train_op = sgd.minimize(loss, global_step)
init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10000)

with tf.Session() as sess:
    sess.run(init_op)
    _time = time.time()
    _time = time.time()
    # warm-up
    for i in range(0, 20):
        sess.run([loss, train_op], feed_dict={img_batch:img_val[i], \
                                              label_batch:label_val[i]})
    print time.time() - _time
    # timeline generation
    run_metadata = tf.RunMetadata()
    _, l = sess.run([loss, train_op], feed_dict={img_batch:img_val[0],
                        label_batch:label_val[0]}, \
                        options=tf.RunOptions( \
                        trace_level=tf.RunOptions.FULL_TRACE), \
                        run_metadata=run_metadata)
    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    with open('timeline-placeholder.ctf.json', 'w') as trace_file:
        trace_file.write(trace.generate_chrome_trace_format())

使用 'chrome://tracing/' 打开后，这是我的时间线：timeline。正如你所看到的，前向传递和后向传递之间存在巨大的暂停（至少占总时间的70%），而且时间线在此期间没有显示任何操作，这似乎是很大的浪费。

它来自哪里?有没有办法避免它? (我正在尝试使用队列,但没有帮助)

1 个答案:

答案 0（得分：0）

我遇到了类似的问题。我发现必须在 gpu:0 上运行分析代码才能获得正确的时间线；在 gpu:1、gpu:2 或 gpu:3 上运行时，跟踪显示中会遗漏很多时间块。

希望它有所帮助。