I am trying to figure out why the code below leaks a huge amount of memory as the iterations proceed. Here is the whole code.
import gc
import tensorflow as tf

# g (the tf.Graph), convolutional_neural_network, read_as_batch, and
# get_batch_piece are defined elsewhere in the full script.

def train_network(file_folder, file_list, hm_epochs, batch_size):
    num_files = len(file_list)
    with g.as_default():
        input_image = tf.placeholder(tf.float32, shape=[1, 40, 200, 300, 3])
        y1 = tf.placeholder(tf.int32)
        y2 = tf.placeholder(tf.float32)

        class_logit, highlight_logit = convolutional_neural_network(input_image)

        class_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=class_logit, labels=y1))
        highlight_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=highlight_logit, labels=y2))

        # 'centered' is a constructor argument of RMSPropOptimizer, not of minimize()
        optimizer1 = tf.train.RMSPropOptimizer(learning_rate=1e-6, centered=True).minimize(class_loss)
        optimizer2 = tf.train.RMSPropOptimizer(learning_rate=1e-7, centered=True).minimize(highlight_loss)

        #### Saving Network ####
        with tf.Session(graph=g) as sess:
            saver = tf.train.Saver(max_to_keep=3)
            sess.run(tf.global_variables_initializer())

            for epoch in xrange(hm_epochs):
                epoch_loss = 0
                for idx in xrange(num_files):
                    _file = file_folder + '/' + file_list[idx]
                    X_total, Y1_class, Y2_score = read_as_batch(_file)
                    n_batch = int(X_total.shape[0] / batch_size)
                    for i in xrange(n_batch):
                        batch_X = get_batch_piece(X_total, batch_size, i)
                        batch_Y1 = get_batch_piece(Y1_class, batch_size, i)
                        batch_Y2 = get_batch_piece(Y2_score, batch_size, i)
                        _, _, a, b, c, d = sess.run([optimizer1, optimizer2, class_loss, highlight_loss, tf.gather(class_logit, 0), tf.gather(highlight_logit, 0)], feed_dict={input_image: batch_X, y1: batch_Y1, y2: batch_Y2})
                        result = float(a) + float(b)
                        del a, b, batch_X, batch_Y1, batch_Y2
                        epoch_loss += result
                    del c, d
                    gc.collect()
                ckpt_path = saver.save(sess, "saved/train", epoch)
Below is the memory profiler result. Through several experiments I found that the functions read_as_batch and get_batch_piece are not the cause of the memory leak.
35 215.758 MiB 0.000 MiB @profile
36 def train_network(file_folder, file_list, hm_epochs, batch_size):
37
38 215.758 MiB 0.000 MiB num_files = len(file_list)
44 215.758 MiB 0.000 MiB with g.as_default():
45
46 216.477 MiB 0.719 MiB input_image = tf.placeholder(tf.float32, shape=[1, 40, 200, 300, 3])
47 216.477 MiB 0.000 MiB y1 = tf.placeholder(tf.int32)
48 216.477 MiB 0.000 MiB y2 = tf.placeholder(tf.float32)
49
50 220.199 MiB 3.723 MiB class_logit, highlight_logit = convolutional_neural_network(input_image)
51
52 220.711 MiB 0.512 MiB class_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=class_logit, labels=y1))
54 220.953 MiB 0.242 MiB highlight_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=highlight_logit, labels=y2))
55
56 227.562 MiB 6.609 MiB optimizer1 = tf.train.RMSPropOptimizer(learning_rate=1e-6).minimize(class_loss)
57 234.062 MiB 6.500 MiB optimizer2 = tf.train.RMSPropOptimizer(learning_rate=1e-7).minimize(highlight_loss)
58
59 #### Saving Network ####
60 660.691 MiB 426.629 MiB with tf.Session(graph=g) as sess:
62 666.848 MiB 6.156 MiB saver = tf.train.Saver(max_to_keep = 3)
63 1183.676 MiB 516.828 MiB sess.run(tf.global_variables_initializer())
67 1642.145 MiB 458.469 MiB for epoch in xrange(hm_epochs):
68 1642.145 MiB 0.000 MiB epoch_loss = 0
69 1642.145 MiB 0.000 MiB file_list_ = iter(file_list)
71 #for idx in xrange(num_files):
74 1642.145 MiB 0.000 MiB _file = file_folder + '/' + file_list_.next()
77 1779.477 MiB 137.332 MiB data = np.load(_file)
78 # Batch Data Generation
79 1916.629 MiB 137.152 MiB X_total = np.array([data[0][0][0], data[0][0][1], ...])
81 # Class, Score Data Fetching
82 1916.629 MiB 0.000 MiB Y1_class = data[0][1][0]
83 1916.629 MiB 0.000 MiB Y2_score = data[0][2][0]
85 1916.629 MiB 0.000 MiB batch_X = get_batch_piece(X_total, 1, 1)
86 1916.629 MiB 0.000 MiB batch_Y1 = get_batch_piece(Y1_class, 1, 1)
87 1916.629 MiB 0.000 MiB batch_Y2 = get_batch_piece(Y2_score, 1, 1)
88 1916.805 MiB 0.176 MiB _ = sess.run([optimizer1], feed_dict={input_image: batch_X, y1: batch_Y1, y2: batch_Y2})
89
90 1642.145 MiB -274.660 MiB del data, X_total, Y1_class, Y2_score, batch_X, batch_Y1, batch_Y2, optimizer1
I shortened the code for readability. Even though this profiling result differs slightly from the original code, it is the same code in essence and shows the same problem (the memory leak). The fact is that when I remove sess.run([optimizer1], ...), the code does not leak memory even past 100 epochs. However, when I do run the session, memory usage keeps growing, so I cannot even train past epoch 5.
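(As an aside, for anyone reproducing this: one way to check whether the TensorFlow graph itself is growing, rather than ordinary Python objects, is to count its operations between iterations, or to finalize the graph once it is fully built so that any later op creation raises an error immediately. This is only a sketch against the same g as above, not part of my original script.)

    # Print once per outer iteration; a steadily increasing count means
    # new operations are being added to the graph inside the training loop.
    print('graph ops:', len(g.get_operations()))

    # Alternatively, freeze the graph after construction; any subsequent
    # attempt to add ops raises a RuntimeError instead of silently growing.
    g.finalize()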
I need your help. Thank you.
Answer 0 (score: 1)
The reason is that you create new TensorFlow operations on every session call. Move these two out of the for loop: tf.gather(class_logit, 0) and tf.gather(highlight_logit, 0), and the problem should go away.
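For concreteness, here is a minimal sketch of that change, building on the code above (the names class_first and highlight_first are just illustrative):

    # Build these once, in the graph-construction block (inside "with g.as_default():")
    class_first = tf.gather(class_logit, 0)
    highlight_first = tf.gather(highlight_logit, 0)

    # Inside the training loop, run the pre-built tensors instead of calling tf.gather again
    _, _, a, b, c, d = sess.run(
        [optimizer1, optimizer2, class_loss, highlight_loss, class_first, highlight_first],
        feed_dict={input_image: batch_X, y1: batch_Y1, y2: batch_Y2})

Every tf.gather(...) call inside the loop adds a new node to the graph, so the graph (and the memory backing it) grows on every sess.run. Defining the gather ops once during graph construction keeps the graph fixed, which is why the leak disappears.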