Question

我有一个构建卷积模型，但却发现我的代码或Tensorflow代码中存在大量内存泄漏。任何人都可以发现问题并深入了解问题所在吗？

以下是一个可重复性最小的示例及其部分输出：

Process.py：

import Process

import time
import numpy as np
import os

import tensorflow as tf
from datetime import datetime

FLAGS = tf.app.flags.FLAGS


def train():
    with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

        images, labels = Process.inputs()

        forward_propgation_results = Process.forward_propagation(images)

        cost = Process.error(forward_propgation_results, labels)

        train_op = Process.train(cost, global_step)

        image_summary_t = tf.image_summary(images.name, images, max_images = 2)

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        saver = tf.train.Saver()

        sess = tf.InteractiveSession()

        sess.run(init)

        saver = tf.train.Saver(tf.all_variables())

        tf.train.start_queue_runners(sess = sess)

        train_dir = "/home/zan/nn-data"

        summary_writer = tf.train.SummaryWriter(train_dir, sess.graph)

        for step in xrange(650):
            start_time = time.time()
            _, loss_value = sess.run([train_op, cost])
            duration = time.time() - start_time

            assert not np.isnan(loss_value)

            if step % 1 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, (%.1f examples/sec; %.3f ''sec/batch)')
                print (format_str % (datetime.now(), step, examples_per_sec, sec_per_batch))

                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)


            if step % 20 or (step + 1) == 20:
                checkpoint_path = os.path.join(train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)

def main(argv = None):
    train()

if __name__ == '__main__':
  tf.app.run()

main.py：

I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 960, pci bus id: 0000:03:00.0)

E tensorflow/stream_executor/cuda/cuda_driver.cc:997] failed to alloc 8589934592 bytes on host: CUDA_ERROR_OUT_OF_MEMORY
W ./tensorflow/core/common_runtime/gpu/pool_allocator.h:195] could not allocate pinned host memory of size: 8589934592
E tensorflow/stream_executor/cuda/cuda_driver.cc:997] failed to alloc 7730940928 bytes on host: CUDA_ERROR_OUT_OF_MEMORY
W ./tensorflow/core/common_runtime/gpu/pool_allocator.h:195] could not allocate pinned host memory of size: 7730940928

幸运的是，我能够及时杀死这个过程，以下是程序记录的输出。

输出：

"\n\t\t\t\t\t\t14\n\t\t\t\t\t\t_You Won\n\t\t\t\t\t"

Tensorflow Memory Leak无法分配固定大小的固定主机内存

0 个答案: