问题:输入准备期间发生OOM错误。是因为num_classes
很大而且choose_from_datasets
占用了大量的cpu内存吗?详情如下。
系统配置:Ubuntu-16.04,tf-nightly-1.10,python-3.6
import tensorflow as tf
from tensorflow.contrib.data.python.ops.interleave_ops import choose_from_datasets
# Total number of distinct classes (presumably one TFRecord file per
# class, since the selector's class ids index the `datasets` list — TODO confirm).
num_classes = 1000
# How many distinct classes are sampled into each batch.
num_classes_per_batch = 32
# How many examples of each sampled class go into one batch
# (so batch size = num_classes_per_batch * num_images_per_class = 64).
num_images_per_class = 2
def generator(num_classes, num_classes_per_batch, num_images_per_class):
    """Produce the 1-D int64 tensor of class labels for one batch.

    Draws `num_classes_per_batch` distinct class ids uniformly from
    [0, num_classes) and repeats each id `num_images_per_class` times
    consecutively, e.g. [c0, c0, c1, c1, ...].
    """
    # Shuffle all class ids, then keep the first few as this batch's classes.
    chosen = tf.random_shuffle(tf.range(num_classes))
    chosen = chosen[:num_classes_per_batch]
    # Duplicate every chosen id `num_images_per_class` times in a row.
    repeated = tf.tile(tf.expand_dims(chosen, axis=-1),
                       [1, num_images_per_class])
    return tf.to_int64(tf.reshape(repeated, [-1]))
# 2.1 Build one shuffled-and-repeated dataset per TFRecord file.
# NOTE(review): with num_classes ~ 1000 this keeps on the order of a
# thousand files open simultaneously, which matches the quoted error
# ("打开的文件太多了" = too many open files). That is file-descriptor
# exhaustion, not RAM OOM — raise the process limit (`ulimit -n`) or
# reduce the number of files interleaved at once.
datasets = [
    tf.data.TFRecordDataset(f).apply(
        tf.contrib.data.shuffle_and_repeat(buffer_size=5))
    for f in tfrecords_files
]
# 2.2 Selector: an endless stream of class indices telling
# choose_from_datasets which per-class dataset to pull each example from.
selector = tf.contrib.data.Counter().map(
    # BUG FIX: was `max_num_images_per_class`, which is never defined in
    # this snippet (NameError); the constant declared above is
    # `num_images_per_class`.
    lambda _: generator(num_classes, num_classes_per_batch,
                        num_images_per_class))
selector = selector.apply(tf.contrib.data.unbatch())
# 2.3 Assemble the final dataset: interleave, decode, batch, prefetch.
BATCH_SIZE = tf.cast(num_classes_per_batch * num_images_per_class, tf.int64)
dataset = choose_from_datasets(datasets, selector)
# `decode` and `num_parallel_calls` are defined elsewhere in the project.
dataset = dataset.map(map_func=decode, num_parallel_calls=num_parallel_calls)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=1)  # overlap input prep with training
# 2.4 Iterator over the pipeline (one-shot; the initializable variant is
# kept below as the documented alternative).
# iterator = dataset.make_initializable_iterator()
iterator = dataset.make_one_shot_iterator()
next_batch = iterator.get_next()
当上面的输入代码运行大约100步时,发生以下问题:
tensorflow.python.framework.errors_impl.ResourceExhaustedError: /data1/face/dataset/teng/msceleb1m_per_class_test/168.tfrecords; 打开的文件太多了 [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,112,112,?],[?]], output_types=[DT_FLOAT,DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"]]] 提示:如果要在OOM发生时查看已分配的张量列表,请将report_tensor_allocations_upon_oom添加到RunOptions以获取当前分配信息。