I am writing a data pipeline in TensorFlow for my NLP model, because with feed_dict I cannot keep my GPU utilized. I am testing the code on dummy data. It works fine on the CPU, but when I run it on the GPU it is very slow, and I noticed this is because the enqueue operation is very slow, so the GPU has to wait for data.
When I print the queue size, it is 0 or at the minimum.
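To make "the enqueue is very slow" concrete, a standalone micro-benchmark along these lines can time just the feed_dict + enqueue_many path in isolation (a minimal sketch, not my real graph; the FIFOQueue, the toy_queue/toy_enqueue names and the capacity are only illustrative):

import time
import numpy as np
import tensorflow as tf

x_ph = tf.placeholder(tf.float32, shape=[None, 1024])
# Plain FIFO queue used only for timing; capacity is sized so 100 batches of 128 fit without blocking.
toy_queue = tf.FIFOQueue(capacity=13000, dtypes=[tf.float32], shapes=[[1024]])
toy_enqueue = toy_queue.enqueue_many([x_ph])

batch = np.random.standard_normal((128, 1024)).astype(np.float32)
with tf.Session() as sess:
    start = time.time()
    for _ in range(100):
        sess.run(toy_enqueue, feed_dict={x_ph: batch})
    print("avg enqueue_many time: %f s" % ((time.time() - start) / 100))

Here is the full test code: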
import threading
import time

import numpy as np
import tensorflow as tf


def data_iterator():
    # Dummy data: batches of 128 random vectors of size 1024 with constant labels.
    while True:
        yield np.random.standard_normal((128, 1024)), np.array([[1]] * 128)
class CustomRunner(object):
    """
    This class manages the background threads needed to fill
    a queue full of data.
    """
    def __init__(self):
        # Placeholders are fed from Python threads via feed_dict and pushed into the queue.
        self.dataX = tf.placeholder(dtype=tf.float32, shape=[None, 1024])
        self.dataY = tf.placeholder(dtype=tf.int32, shape=[None, 1])
        self.queue = tf.RandomShuffleQueue(shapes=[[1024], [1]],
                                           dtypes=[tf.float32, tf.int32],
                                           capacity=2000,
                                           min_after_dequeue=1000)
        self.enqueue_op = self.queue.enqueue_many([self.dataX, self.dataY])
        self.size_op = self.queue.size()

    def get_inputs(self):
        images_batch, labels_batch = self.queue.dequeue_many(128)
        return images_batch, labels_batch

    def thread_main(self, sess):
        # Feeder loop: generate a batch, report the queue size, then enqueue it.
        for dataX, dataY in data_iterator():
            print("QueueSize = %i" % (sess.run(self.size_op)))
            sess.run(self.enqueue_op, feed_dict={self.dataX: dataX, self.dataY: dataY})

    def start_threads(self, sess, n_threads=2):
        threads = []
        for n in range(n_threads):
            t = threading.Thread(target=self.thread_main, args=(sess,))
            t.daemon = True  # thread will close when parent quits
            t.start()
            threads.append(t)
        return threads
# Keep the input pipeline (queue and enqueue ops) on the CPU; the model only consumes the dequeued batches.
with tf.device("/cpu:0"):
    custom_runner = CustomRunner()
    input, y_true = custom_runner.get_inputs()

with tf.variable_scope('FullyConnected'):
    w = tf.get_variable('w', shape=[1024, 1024], initializer=tf.random_normal_initializer(stddev=1e-1))
    b = tf.get_variable('b', shape=[1024], initializer=tf.constant_initializer(0.1))
    z = tf.matmul(input, w) + b
    y = tf.nn.relu(z)

    w2 = tf.get_variable('w2', shape=[1024, 1], initializer=tf.random_normal_initializer(stddev=1e-1))
    b2 = tf.get_variable('b2', shape=[1], initializer=tf.constant_initializer(0.1))
    z = tf.matmul(y, w2) + b2
with tf.variable_scope('Loss'):
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(y_true, tf.float32), logits=z)
    loss_op = tf.reduce_mean(losses)
with tf.variable_scope('Accuracy'):
    y_pred = tf.cast(z > 0, tf.int32)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred, y_true), tf.float32))
    accuracy = tf.Print(accuracy, data=[accuracy], message="accuracy:")
# We add the training op ...
adam = tf.train.AdamOptimizer(1e-2)
train_op = adam.minimize(loss_op, name="train_op")
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=8))
init = tf.initialize_all_variables()
sess.run(init)
# start the tensorflow QueueRunners
tf.train.start_queue_runners(sess=sess)
# start our custom queue runner's threads
custom_runner.start_threads(sess)
startTime = time.time()
for i in range(5000):
    # run_options = tf.RunOptions(timeout_in_ms=4000)
    _, loss = sess.run([train_op, loss_op])
    if i % 500 == 0:
        print('iter:%d - loss:%f' % (i, loss))
print("Time taken: %f" % (time.time() - startTime))