Memory leak in TensorFlow session. MemoryError after the first 2 epochs

Date: 2019-01-07 21:42:10

Tags: python-3.x tensorflow

I am training an Inception v1 model for image classification and hitting a memory leak in the session.run(optimizer, feed_dict={...}) part of the training loop.

The question Tensorflow memory leak in every iteration looks very similar, but as far as I can tell I am not adding anything to the graph inside the loop.
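
One way to rule that out is to finalize the default graph once it has been built; any later attempt to add an op (the usual cause of per-iteration memory growth) then raises a RuntimeError right away instead of leaking silently. A minimal sketch of that check, assuming TF 1.x graph mode as in my code below:

# Freeze the graph after construction; sess.run never adds ops itself,
# so training keeps working unless something really is growing the graph.
tf.get_default_graph().finalize()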

import tensorflow as tf
import psutil
from tqdm import tqdm

import inception_v1  # my own Inception v1 definition

# learning_rate, batch_size, training_epochs, graph_path, train_x,
# validate_x and batch_object_train are defined earlier (omitted here)

x = tf.placeholder(tf.float32, [None, 224, 224, 3])
y = tf.placeholder(tf.int32, [None, 11])
keep_prob = tf.placeholder_with_default(1.0, shape=())
global_step = tf.Variable(0, trainable=False)

logits_dict = inception_v1.inception_module(x, keep_prob)

with tf.name_scope('cross_entropy'):
    cross_entropy_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_dict['logits'],
                                                                                   labels=y,
                                                                                   name='cross_entropy_loss'))

with tf.name_scope('aux_loss_0'):
    aux_loss_0 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_dict['aux_0'], labels=y,
                                                                           name='aux_loss_0'))
with tf.name_scope('aux_loss_1'):
    aux_loss_1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits_dict['aux_1'], labels=y,
                                                                           name='aux_loss_1'))
with tf.name_scope('loss'):
    loss = cross_entropy_loss + 0.3*aux_loss_0 + 0.3*aux_loss_1
    tf.summary.scalar('loss', loss)
with tf.device('/gpu:2'):
    with tf.name_scope('optimizer'):
        # cosine decay with warm restarts (first decay period: 1000 steps)
        lr_schedule = tf.train.cosine_decay_restarts(learning_rate, global_step, 1000)
        optimizer = tf.train.MomentumOptimizer(learning_rate=lr_schedule,
                                               momentum=0.9).minimize(loss, global_step=global_step)

with tf.name_scope('accuracy'):
    correct = tf.equal(tf.argmax(logits_dict['logits'], 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

y_hat = tf.argmax(logits_dict['logits'], 1)
# merge all summaries so they can be written to the event file
merged = tf.summary.merge_all()

# Check point saver
saver = tf.train.Saver()

init = tf.global_variables_initializer()
print("Training CNN......")
gpu_options = tf.GPUOptions()
gpu_options.per_process_gpu_memory_fraction = 0.7
gpu_options.allow_growth = True
best_validation_accuracy = 0

with tf.Session(config=tf.ConfigProto(
  allow_soft_placement=True, log_device_placement=True, gpu_options=gpu_options)) as sess:
    init.run()
    n_train_batches = len(train_x) // batch_size
    n_validation_batches = len(validate_x) // batch_size
    train_writer = tf.summary.FileWriter(graph_path + '/train', sess.graph)
    val_writer = tf.summary.FileWriter(graph_path + '/validation')
    for epoch in tqdm(range(training_epochs)):
        print("Current Virtual Memory Status:", psutil.virtual_memory())
        for t_batch in range(n_train_batches):
            train_x_batch, train_y_batch = batch_object_train.next_batch(batch_size)
            sess.run(optimizer, feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 0.6})
            c, a = sess.run([loss, accuracy], feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 1.0})
            print("Training -> Epoch:{}, Batch:{}, Loss:{}, "
                  "Accuracy:{}".format(epoch, t_batch, round(c*100, 2), round(a*100, 2)))
            summary = sess.run(merged, feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 1.0})
            train_writer.add_summary(summary, epoch * n_train_batches + t_batch)
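
Not the leak itself, but the second and third sess.run calls above use an identical feed, so they can be fused into one call that runs the forward pass once instead of twice. A sketch using the same names as above:

# Fetch loss, accuracy and the merged summary in one run over the same feed.
c, a, summary = sess.run([loss, accuracy, merged],
                         feed_dict={x: train_x_batch, y: train_y_batch, keep_prob: 1.0})
train_writer.add_summary(summary, epoch * n_train_batches + t_batch)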

Training runs fine for the first 2 epochs, but then it runs out of memory with a MemoryError.
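
To narrow down where the memory goes, the graph op count can be printed next to the psutil numbers; a small diagnostic sketch for the top of the epoch loop:

# A constant count means the graph is static and the leak is on the Python
# side (e.g. batch data or summaries accumulating); a growing count means
# ops are still being added to the graph somewhere.
print("Graph ops:", len(tf.get_default_graph().get_operations()))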

0 Answers:

No answers yet.