我构建了一个卷积模型,但发现我的代码或 TensorFlow 本身存在严重的内存泄漏。有人能找出问题所在并解释其原因吗?
以下是一个最小可复现示例及其部分输出:
Process.py:
import Process
import time
import numpy as np
import os
import tensorflow as tf
from datetime import datetime
# Shared flags object from tf.app.flags; train() reads FLAGS.batch_size from it.
FLAGS = tf.app.flags.FLAGS
def train(max_steps=650, checkpoint_every=20, train_dir="/home/zan/nn-data"):
    """Build the model graph and run the training loop.

    The graph is assembled from the helpers in ``Process`` (inputs, forward
    pass, error and training op). Every step runs one training update, prints
    throughput, and writes the merged summaries. A checkpoint is saved every
    ``checkpoint_every`` steps and once more after the final step.

    Args:
        max_steps: Number of training steps to run.
        checkpoint_every: Interval, in steps, between checkpoints.
        train_dir: Directory that receives summaries and checkpoints.

    Raises:
        AssertionError: If the loss evaluates to NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = Process.inputs()
        forward_propgation_results = Process.forward_propagation(images)
        cost = Process.error(forward_propgation_results, labels)
        train_op = Process.train(cost, global_step)

        # The summary op only has to exist in the graph so that
        # merge_all_summaries() picks it up; its handle is not needed.
        tf.image_summary(images.name, images, max_images=2)
        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()
        # A single Saver is enough; the original built two and threw the
        # first one away.
        saver = tf.train.Saver(tf.all_variables())

        sess = tf.InteractiveSession()
        coord = tf.train.Coordinator()
        threads = []
        summary_writer = None
        try:
            sess.run(init)
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            summary_writer = tf.train.SummaryWriter(train_dir, sess.graph)
            checkpoint_path = os.path.join(train_dir, 'model.ckpt')
            format_str = '%s: step %d, (%.1f examples/sec; %.3f sec/batch)'

            for step in range(max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, cost])
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                # Progress and summaries are reported on every step.
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                print(format_str % (datetime.now(), step,
                                    examples_per_sec, sec_per_batch))

                # NOTE(review): this is a second sess.run, so the summary is
                # presumably computed on a different input batch than the
                # training step above -- confirm this is intended.
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Checkpoint periodically and after the last step. The old
                # test `step % 20` was truthy on every step NOT divisible by
                # 20, so it saved on almost every iteration.
                is_last_step = (step + 1) == max_steps
                if step % checkpoint_every == 0 or is_last_step:
                    saver.save(sess, checkpoint_path, global_step=step)
        finally:
            # Stop the input threads before closing the session so they do
            # not keep running against a closed session.
            coord.request_stop()
            try:
                coord.join(threads)
            finally:
                if summary_writer is not None:
                    summary_writer.close()
                sess.close()
def main(argv=None):
    """Script entry point: start training.

    ``argv`` is accepted for the app runner's calling convention but is not
    used here.
    """
    train()


if __name__ == '__main__':
    # Hand control to TensorFlow's app runner (presumably it dispatches to
    # main() above -- see the tf.app.run documentation).
    tf.app.run()
main.py:
I tensorflow/core/common_runtime/gpu/gpu_device.cc:838] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 960, pci bus id: 0000:03:00.0)
E tensorflow/stream_executor/cuda/cuda_driver.cc:997] failed to alloc 8589934592 bytes on host: CUDA_ERROR_OUT_OF_MEMORY
W ./tensorflow/core/common_runtime/gpu/pool_allocator.h:195] could not allocate pinned host memory of size: 8589934592
E tensorflow/stream_executor/cuda/cuda_driver.cc:997] failed to alloc 7730940928 bytes on host: CUDA_ERROR_OUT_OF_MEMORY
W ./tensorflow/core/common_runtime/gpu/pool_allocator.h:195] could not allocate pinned host memory of size: 7730940928
幸运的是,我及时终止了该进程,以下是程序记录的输出。
输出:
"\n\t\t\t\t\t\t14\n\t\t\t\t\t\t_You Won\n\t\t\t\t\t"