批处理数据生成器如下:
# Input-pipeline hyperparameters: batch size and the fixed height/width
# (in pixels) of the single-channel images stored in the TFRecords.
BATCH_SIZE=512
HEIGHT=128
WIDTH=128
def read_and_decode(filename_queue):
    """Parse one serialized TFRecord Example into an (image, label) tensor pair.

    Args:
        filename_queue: queue of TFRecord filenames, typically produced by
            tf.train.string_input_producer.

    Returns:
        image: float32 tensor of shape [HEIGHT, WIDTH, 1].
        label: float32 tensor of shape [48] (stored on disk as int64 and cast
            here so it can be fed straight into a float loss).
    """
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        # Defaults are not specified since both keys are required.
        features={
            'X': tf.FixedLenFeature([HEIGHT, WIDTH, 1], tf.float32),
            'y': tf.FixedLenFeature([48], tf.int64),
        })
    # 'X' is already parsed as float32, so this cast is a no-op kept for
    # symmetry with the label cast below.
    image = tf.cast(features['X'], tf.float32)
    label = tf.cast(features['y'], tf.float32)
    return image, label
def inputs(filenames, batch_size, num_epochs, shuffle=True):
    """Build the input pipeline: filename queue -> decode -> (shuffled) batch.

    Args:
        filenames: list of TFRecord file paths.
        batch_size: number of examples per emitted batch.
        num_epochs: epochs to read; 0 means "repeat forever".
        shuffle: if True, shuffle both filenames and examples.

    Returns:
        images: float32 tensor [batch_size, HEIGHT, WIDTH, 1].
        sparse_labels: float32 tensor [batch_size, 48].
    """
    if num_epochs == 0:
        # string_input_producer treats num_epochs=None as unlimited epochs.
        num_epochs = None
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(filenames,
                                                        shuffle=shuffle,
                                                        capacity=16 * batch_size,
                                                        num_epochs=num_epochs)
        # NOTE(review): this creates a SINGLE reader/parse op. num_threads on
        # the batching queue below only runs this one op with multiple threads
        # pulling from it; with one TFRecordReader the file read itself is
        # serialized, which is why CPU stays below 100%. For real parallel I/O
        # build N read_and_decode copies and use shuffle_batch_join, or move
        # to the tf.data API.
        image, label = read_and_decode(filename_queue)
        if shuffle:
            images, sparse_labels = tf.train.shuffle_batch(
                [image, label],
                batch_size=batch_size,
                num_threads=4,
                capacity=32 * batch_size,
                # Minimum fill kept after a dequeue; governs shuffle quality.
                min_after_dequeue=1 * batch_size)
        else:
            images, sparse_labels = tf.train.batch(
                [image, label],
                batch_size=batch_size,
                num_threads=1,
                capacity=1 * batch_size,
                # Emit the undersized remainder at end-of-epoch (eval use).
                allow_smaller_final_batch=True)
    return images, sparse_labels
主要消费者代码如下:
# Time one training step: first the data fetch, then the actual train op.
t0=time.time()
# Dequeue a batch out of the prefetch queue into Python/numpy arrays.
image_val, label_val = sess.run([image_op, label_op])
t1=time.time()
# NOTE(review): feeding the fetched batch back through placeholders copies it
# out of the graph and back in every step; wiring the queue outputs directly
# into the model (no feed_dict) would remove this round-trip entirely.
_,output_val,loss_val,summary_val=sess.run([optimizer,output,loss,summary_op], feed_dict={image_ph:image_val, label_ph:label_val, phase_train:True})
t2=time.time()
# Columns: step, loss, data-load seconds (t1-t0), train seconds (t2-t1).
print(global_step, loss_val, t1-t0, t2-t1)
部分结果如下:
(1, 4.7936598e+08, 1.6970429420471191, 1.16087007522583)
(2, 4.7492218e+08, 2.3166861534118652, 1.1548128128051758)
(3, 4.8047462e+08, 2.3188629150390625, 1.1514790058135986)
(4, 4.8164006e+08, 2.533250093460083, 1.150022029876709)
(5, 4.9936403e+08, 2.5249040126800537, 1.1556730270385742)
(6, 5.0953357e+08, 2.3015151023864746, 1.1535239219665527)
(7, 4.6024166e+08, 2.1644229888916016, 1.159350872039795)
(8, 4.7846029e+08, 2.4160208702087402, 1.1518781185150146)
显然,数据加载的时间比网络执行的时间长得多。我尝试过 capacity、num_threads 和 min_after_dequeue 的不同组合,但 CPU 使用率始终小于 100%(约 70%~80%),这表示多线程并没有真正跑起来。为什么呢?
据我了解,数据预取的队列应该始终是满的,这样训练可以更快。但遗憾的是,似乎并非如此。如果基于队列的数据预取工作良好,CPU 使用率应该很高——因为它忙于预取数据,而数据又很快从队列中出队并送入网络训练。这很奇怪。tf.train.shuffle_batch 中有错误吗?为什么 CPU 没有忙于数据预取?