To profile a CNN model, I am using the CUPTI-based profiling tools, as shown below:
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import numpy as np
import tensorflow as tf
from scipy import misc
import time
from tensorflow.python.client import timeline
from tensorflow.contrib import layers
from tensorflow.contrib.framework import arg_scope, arg_scoped_arguments
DATA_FORMAT = 'NCHW'
TRAIN_BATCH_SIZE = 64
TLU_THRESHOLD = 3.
SRM_Kernels = np.random.rand(5,5,1,30).astype('float32')
class Model():
    def _build_model(self, inputs, is_training, data_format, name='HolyGraal'):
        self.name = name
        self.inputs = inputs
        if data_format == 'NCHW':
            _inputs = tf.transpose(inputs, [0, 3, 1, 2])
        else:
            _inputs = inputs
        self.L = []
        with arg_scope([layers.avg_pool2d],
                       kernel_size=[3,3], stride=[2,2], padding='VALID',
                       data_format=data_format):
            with tf.variable_scope('SRM_preprocess'):
                W_SRM = tf.get_variable('W', initializer=SRM_Kernels,
                                        dtype=tf.float32,
                                        regularizer=layers.l2_regularizer(5e-4))
                b = tf.get_variable('b', shape=[30], dtype=tf.float32,
                                    initializer=tf.constant_initializer(0.2))
                self.L.append(tf.nn.bias_add(
                    tf.nn.conv2d(tf.cast(_inputs, tf.float32),
                                 W_SRM, [1,1,1,1], 'VALID',
                                 data_format=data_format),
                    b, data_format=data_format, name='Layer1'))
                self.L.append(tf.clip_by_value(self.L[-1],
                                               -TLU_THRESHOLD, TLU_THRESHOLD,
                                               name='TLU'))
            with tf.variable_scope('ConvNetwork'):
                with arg_scope([layers.conv2d], num_outputs=30,
                               kernel_size=3, stride=1, padding='VALID',
                               data_format=data_format, activation_fn=tf.nn.relu,
                               weights_initializer=layers.xavier_initializer_conv2d(),
                               weights_regularizer=layers.l2_regularizer(5e-4),
                               biases_initializer=tf.constant_initializer(0.2),
                               biases_regularizer=None):
                    self.L.append(layers.conv2d(self.L[-1], scope='Layer2'))
                    self.L.append(layers.conv2d(self.L[-1], scope='Layer3'))
                    self.L.append(layers.conv2d(self.L[-1], scope='Layer4'))
                    self.L.append(layers.avg_pool2d(self.L[-1],
                                                    kernel_size=[2,2],
                                                    scope='Stride1'))
                    with arg_scope([layers.conv2d], kernel_size=5,
                                   num_outputs=32):
                        self.L.append(layers.conv2d(self.L[-1], scope='Layer5'))
                        self.L.append(layers.avg_pool2d(self.L[-1],
                                                        scope='Stride2'))
                        self.L.append(layers.conv2d(self.L[-1], scope='Layer6'))
                        self.L.append(layers.avg_pool2d(self.L[-1],
                                                        scope='Stride3'))
                        self.L.append(layers.conv2d(self.L[-1], scope='Layer7'))
                        self.L.append(layers.avg_pool2d(self.L[-1],
                                                        scope='Stride4'))
                        self.L.append(layers.conv2d(self.L[-1],
                                                    num_outputs=16,
                                                    scope='Layer8'))
                        self.L.append(layers.conv2d(self.L[-1],
                                                    num_outputs=16, stride=3,
                                                    scope='Layer9'))
        self.L.append(layers.flatten(self.L[-1]))
        self.L.append(layers.fully_connected(self.L[-1], num_outputs=2,
                                             activation_fn=None, normalizer_fn=None,
                                             weights_initializer=tf.random_normal_initializer(mean=0., stddev=0.01),
                                             biases_initializer=tf.constant_initializer(0.),
                                             scope='ip'))
        self.outputs = self.L[-1]
        return self.outputs

    def _build_losses(self, labels):
        with tf.variable_scope('loss'):
            oh = tf.one_hot(labels, 2)
            self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                labels=oh, logits=self.outputs))
        with tf.variable_scope('accuracy'):
            am = tf.argmax(self.outputs, 1)
            equal = tf.equal(am, labels)
            self.accuracy = tf.reduce_mean(tf.cast(equal, tf.float32))
        return self.loss, self.accuracy

    def _build_summaries(self):
        loss_summary = tf.summary.scalar('train_loss', self.loss)
        return loss_summary
is_training = tf.get_variable('is_training', dtype=tf.bool, shape=[],
                              initializer=tf.constant_initializer(True),
                              trainable=False)
img_batch = tf.placeholder(shape=[None, 256, 256, 1], dtype='uint8')
label_batch = tf.placeholder(shape=[None], dtype='uint8')
img_val = np.random.randint(0, 256, (21, 64, 256, 256, 1))
label_val = np.random.randint(0, 2, (21, 64))
model = Model()
outputs = model._build_model(img_batch, is_training, DATA_FORMAT)
loss, accuracy = model._build_losses(tf.cast(label_batch, tf.int64))
sgd = tf.train.AdamOptimizer()
global_step = tf.get_variable('global_step', dtype=tf.int32, shape=[],
                              initializer=tf.constant_initializer(0),
                              trainable=False)
train_op = sgd.minimize(loss, global_step)
init_op = tf.global_variables_initializer()
saver = tf.train.Saver(max_to_keep=10000)
with tf.Session() as sess:
    sess.run(init_op)
    _time = time.time()
    # warm-up
    for i in range(0, 20):
        sess.run([loss, train_op], feed_dict={img_batch: img_val[i],
                                              label_batch: label_val[i]})
    print(time.time() - _time)
    # timeline generation
    run_metadata = tf.RunMetadata()
    _, l = sess.run([loss, train_op],
                    feed_dict={img_batch: img_val[0],
                               label_batch: label_val[0]},
                    options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                    run_metadata=run_metadata)
    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    with open('timeline-placeholder.ctf.json', 'w') as trace_file:
        trace_file.write(trace.generate_chrome_trace_format())
Viewing the result in 'chrome://tracing/', here is my timeline: as you can see, there is a huge pause (at least 70% of the step time) between the forward and backward passes, during which no operations are shown on the timeline. This looks like a big waste.
Where does it come from? Is there a way to avoid it? (I have tried using queues, but it did not help.)
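For reference, the kind of input pipeline I experimented with looks roughly like this: a minimal sketch using tf.data with prefetching in place of the two placeholders (the reshaping of my dummy arrays here is illustrative, not my exact queue code):

# Minimal sketch (illustrative): feed from a prefetching tf.data pipeline
# instead of feed_dict, so host-to-device copies can overlap with compute.
dataset = tf.data.Dataset.from_tensor_slices(
    (img_val.reshape(-1, 256, 256, 1).astype('uint8'),
     label_val.reshape(-1).astype('uint8')))
dataset = dataset.batch(TRAIN_BATCH_SIZE).repeat().prefetch(1)
img_batch, label_batch = dataset.make_one_shot_iterator().get_next()
# sess.run([loss, train_op]) is then called without a feed_dict.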
Answer 0 (score: 0):
I ran into a similar problem. I found that you have to run the profiling code on gpu:0 to get a correct timeline; running it on gpu:1, 2, or 3 misses a lot of the time blocks in the trace display.
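For the code above, that would mean exposing physical GPU 0 instead of GPU 3 before the session is created (a minimal sketch, assuming GPU 0 is free on your machine):

# Expose only physical GPU 0 to the process; TensorFlow still names it
# '/gpu:0' internally, and in my experience the trace then shows all blocks.
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES'] = '0'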
Hope it helps.