I am training an Inception v1 module for image classification and I am getting a memory leak at the session.run(optimizer, feed_dict={...}) call.
My problem is very similar to "Tensorflow memory leak in every iteration", but I don't think I am adding anything to the graph inside the loop.
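As a sanity check for graph growth, I know the graph can be finalized before the loop so that any hidden op creation raises an error instead of silently leaking. A minimal, self-contained sketch of that check (toy graph, not my actual model):

import tensorflow as tf

# Toy graph standing in for the real model.
a = tf.placeholder(tf.float32, shape=())
b = a * 2.0

with tf.Session() as sess:
    sess.graph.finalize()  # the graph is now read-only
    for step in range(3):
        # If anything in this loop tried to create a new op, TensorFlow
        # would raise "RuntimeError: Graph is finalized and cannot be
        # modified" here instead of silently growing the graph.
        print(sess.run(b, feed_dict={a: float(step)}))

Here is the relevant part of my code: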
import psutil
import tensorflow as tf
from tqdm import tqdm

# learning_rate, batch_size, training_epochs, train_x, validate_x,
# batch_object_train, graph_path and the inception_v1 module are all
# defined earlier in my script.
x = tf.placeholder(tf.float32, [None, 224, 224, 3])
y = tf.placeholder(tf.int32, [None, 11])
keep_prob = tf.placeholder_with_default(1.0, shape=())
global_step = tf.Variable(0, trainable=False)
logits_dict = inception_v1.inception_module(x, keep_prob)
with tf.name_scope('cross_entropy'):
    cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits_dict['logits'], labels=y, name='cross_entropy_loss'))
with tf.name_scope('aux_loss_0'):
    aux_loss_0 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits_dict['aux_0'], labels=y, name='aux_loss_0'))
with tf.name_scope('aux_loss_1'):
    aux_loss_1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits_dict['aux_1'], labels=y, name='aux_loss_1'))
with tf.name_scope('loss'):
    loss = cross_entropy_loss + 0.3 * aux_loss_0 + 0.3 * aux_loss_1
    tf.summary.scalar('loss', loss)
with tf.device('/gpu:2'):
    with tf.name_scope('optimizer'):
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=tf.train.cosine_decay_restarts(learning_rate, global_step, 1000),
            momentum=0.9).minimize(loss, global_step=global_step)
with tf.name_scope('accuracy'):
    correct = tf.equal(tf.argmax(logits_dict['logits'], 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    tf.summary.scalar('accuracy', accuracy)
y_hat = tf.argmax(logits_dict['logits'], 1)
# merge all summaries so they can be written to the event file
merged = tf.summary.merge_all()
# checkpoint saver
saver = tf.train.Saver()
init = tf.global_variables_initializer()
print("Training CNN......")
gpu_options = tf.GPUOptions()
gpu_options.per_process_gpu_memory_fraction = 0.7
gpu_options.allow_growth = True
best_validation_accuracy = 0
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=True,
                                      gpu_options=gpu_options)) as sess:
    init.run()
    n_train_batches = len(train_x) // batch_size
    n_validation_batches = len(validate_x) // batch_size
    train_writer = tf.summary.FileWriter(graph_path + '/train', sess.graph)
    val_writer = tf.summary.FileWriter(graph_path + '/validation')
    for epoch in tqdm(range(training_epochs)):
        print("Current Virtual Memory Status:", psutil.virtual_memory())
        for t_batch in range(n_train_batches):
            train_x_batch, train_y_batch = batch_object_train.next_batch(batch_size)
            sess.run(optimizer, feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 0.6})
            c, a = sess.run([loss, accuracy],
                            feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 1.0})
            print("Training -> Epoch:{}, Batch:{}, Loss:{}, "
                  "Accuracy:{}".format(epoch, t_batch, round(c, 4), round(a * 100, 2)))
            summary = sess.run(merged, feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 1.0})
            # monotonically increasing step index for TensorBoard
            train_writer.add_summary(summary, epoch * n_train_batches + t_batch)
Training runs for two epochs, and then the process runs out of memory.
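For reference, the inner loop could also be collapsed into a single sess.run per batch (a sketch using the same tensors defined above; note it reports the loss/accuracy of the dropout forward pass rather than a separate keep_prob=1.0 pass):

# Sketch: one graph execution per batch instead of three.
for t_batch in range(n_train_batches):
    train_x_batch, train_y_batch = batch_object_train.next_batch(batch_size)
    _, c, a, summary = sess.run(
        [optimizer, loss, accuracy, merged],
        feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 0.6})
    print("Training -> Epoch:{}, Batch:{}, Loss:{}, Accuracy:{}".format(
        epoch, t_batch, round(c, 4), round(a * 100, 2)))
    train_writer.add_summary(summary, epoch * n_train_batches + t_batch)

If the graph is really fixed, memory use should stay flat across batches, which is not what I am seeing.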