I am writing a data pipeline in TensorFlow for my NLP model, because with feed_dict I cannot keep my GPU utilized. I am testing the code on dummy data. It works fine on the CPU, but when I run it on the GPU it is very slow, and I noticed this is because the enqueue operation is very slow, so the GPU has to wait for data.
When I print the queue size, it is 0 or at the minimum.
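To make "the enqueue is very slow" concrete, a standalone micro-benchmark along these lines can time just the feed_dict + enqueue_many path in isolation (a minimal sketch, not my real graph; the FIFOQueue, the toy_queue/toy_enqueue names and the capacity are only illustrative):

import time
import numpy as np
import tensorflow as tf

x_ph = tf.placeholder(tf.float32, shape=[None, 1024])
# Plain FIFO queue used only for timing; capacity is sized so 100 batches of 128 fit without blocking.
toy_queue = tf.FIFOQueue(capacity=13000, dtypes=[tf.float32], shapes=[[1024]])
toy_enqueue = toy_queue.enqueue_many([x_ph])

batch = np.random.standard_normal((128, 1024)).astype(np.float32)
with tf.Session() as sess:
    start = time.time()
    for _ in range(100):
        sess.run(toy_enqueue, feed_dict={x_ph: batch})
    print("avg enqueue_many time: %f s" % ((time.time() - start) / 100))

Here is the full test code: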
import threading
import time

import numpy as np
import tensorflow as tf


def data_iterator():
    # Dummy data: batches of 128 random vectors of size 1024 with constant labels.
    while True:
        yield np.random.standard_normal((128, 1024)), np.array([[1]] * 128)
class CustomRunner(object):
    """
    This class manages the background threads needed to fill
    a queue full of data.
    """
    def __init__(self):
        # Placeholders are fed from Python threads via feed_dict and pushed into the queue.
        self.dataX = tf.placeholder(dtype=tf.float32, shape=[None, 1024])
        self.dataY = tf.placeholder(dtype=tf.int32, shape=[None, 1])
        self.queue = tf.RandomShuffleQueue(shapes=[[1024], [1]],
                                           dtypes=[tf.float32, tf.int32],
                                           capacity=2000,
                                           min_after_dequeue=1000)
        self.enqueue_op = self.queue.enqueue_many([self.dataX, self.dataY])
        self.size_op = self.queue.size()

    def get_inputs(self):
        images_batch, labels_batch = self.queue.dequeue_many(128)
        return images_batch, labels_batch

    def thread_main(self, sess):
        # Feeder loop: generate a batch, report the queue size, then enqueue it.
        for dataX, dataY in data_iterator():
            print("QueueSize = %i" % (sess.run(self.size_op)))
            sess.run(self.enqueue_op, feed_dict={self.dataX: dataX, self.dataY: dataY})

    def start_threads(self, sess, n_threads=2):
        threads = []
        for n in range(n_threads):
            t = threading.Thread(target=self.thread_main, args=(sess,))
            t.daemon = True  # thread will close when parent quits
            t.start()
            threads.append(t)
        return threads
# Keep the input pipeline (queue and enqueue ops) on the CPU; the model only consumes the dequeued batches.
with tf.device("/cpu:0"):
    custom_runner = CustomRunner()
    input, y_true = custom_runner.get_inputs()

with tf.variable_scope('FullyConnected'):
    w = tf.get_variable('w', shape=[1024, 1024], initializer=tf.random_normal_initializer(stddev=1e-1))
    b = tf.get_variable('b', shape=[1024], initializer=tf.constant_initializer(0.1))
    z = tf.matmul(input, w) + b
    y = tf.nn.relu(z)

    w2 = tf.get_variable('w2', shape=[1024, 1], initializer=tf.random_normal_initializer(stddev=1e-1))
    b2 = tf.get_variable('b2', shape=[1], initializer=tf.constant_initializer(0.1))
    z = tf.matmul(y, w2) + b2
with tf.variable_scope('Loss'):
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(y_true, tf.float32), logits=z)
    loss_op = tf.reduce_mean(losses)
with tf.variable_scope('Accuracy'):
    y_pred = tf.cast(z > 0, tf.int32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
    accuracy = tf.Print(accuracy, data=[accuracy], message="accuracy:")
# We add the training op ...
adam = tf.train.AdamOptimizer(1e-2)
train_op = adam.minimize(loss_op, name="train_op")
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=8))
init = tf.initialize_all_variables()
sess.run(init)
# start the tensorflow QueueRunners
tf.train.start_queue_runners(sess=sess)
# start our custom queue runner's threads
custom_runner.start_threads(sess)
startTime = time.time()
for i in range(5000):
    # run_options = tf.RunOptions(timeout_in_ms=4000)
    _, loss = sess.run([train_op, loss_op])
    if i % 500 == 0:
        print('iter:%d - loss:%f' % (i, loss))
print("Time taken: %f" % (time.time() - startTime))