使用此存储库中的评估脚本(evaluate.py)以 DeepLabV3+ 运行语义分割时,出现内存不足(OOM)错误。
奇怪的是,我已经成功地训练了网络,却无法对其进行评估。
我的批处理大小为1,因此无法进一步减小(减小批处理大小是其他遇到OOM问题的用户所采用的解决方案)。
出于测试目的,我对单个图像进行了评估,效果很好。但是,当我为文件夹分配完整的数据集(约50张图像)时,问题再次出现。
我怀疑问题出在 utils/preprocessing.py 的 eval_input_fn() 方法中(第234、244行),该方法尝试读取所有数据并将其解析为浮点数。
任何有关如何进行的想法都非常受欢迎。 我还将以下两个脚本(evaluate.py,preprocessing.py)和错误附在下面。
硬件:4 块 Titan Xp 12GB(代码仅使用其中一块)
"""Evaluate a DeepLab v3 model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import tensorflow as tf
import deeplab_model
from utils import preprocessing
from utils import dataset_util
import numpy as np
import timeit
# Command-line interface of the evaluation script.
parser = argparse.ArgumentParser()

# Where the evaluation images, their ground-truth masks and the list of
# example names live.
parser.add_argument(
    '--image_data_dir',
    default='dataset/VOCdevkit/VOC2012/JPEGImages',
    type=str,
    help='The directory containing the image data.')
parser.add_argument(
    '--label_data_dir',
    default='dataset/VOCdevkit/VOC2012/SegmentationClassAug',
    type=str,
    help='The directory containing the ground truth label data.')
parser.add_argument(
    '--evaluation_data_list',
    default='./dataset/val.txt',
    type=str,
    help='Path to the file listing the evaluation images.')

# Directory holding the checkpoint to evaluate.
parser.add_argument(
    '--model_dir',
    default='./model',
    type=str,
    help=("Base directory for the model. "
          "Make sure 'model_checkpoint_path' given in 'checkpoint' file matches "
          "with checkpoint name."))

# Network configuration; restricted to the listed choices.
parser.add_argument(
    '--base_architecture',
    default='resnet_v2_101',
    choices=['resnet_v2_50', 'resnet_v2_101'],
    type=str,
    help='The architecture of base Resnet building block.')
parser.add_argument(
    '--output_stride',
    default=16,
    choices=[8, 16],
    type=int,
    help='Output stride for DeepLab v3. Currently 8 or 16 is supported.')

# Number of segmentation classes; sizes the confusion matrix in `main`.
_NUM_CLASSES = 21
def main(unused_argv):
    """Evaluate the latest checkpoint on the evaluation list and print metrics.

    Builds the model in EVAL mode on top of `preprocessing.eval_input_fn`,
    restores the most recent checkpoint found in `FLAGS.model_dir`, runs the
    input pipeline to exhaustion while summing the per-step confusion
    matrices, and finally prints per-class IoU, mean IoU and pixel accuracy.

    Args:
      unused_argv: Ignored; present only to satisfy the `tf.app.run` signature.

    Raises:
      ValueError: If no checkpoint can be found in `FLAGS.model_dir`.
    """
    # Using the Winograd non-fused algorithms provides a small performance boost.
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    examples = dataset_util.read_examples_list(FLAGS.evaluation_data_list)
    image_files = [os.path.join(FLAGS.image_data_dir, filename) + '.jpg'
                   for filename in examples]
    label_files = [os.path.join(FLAGS.label_data_dir, filename) + '.png'
                   for filename in examples]

    # NOTE(review): images are evaluated at whatever size the input function
    # yields; presumably no resizing happens there, so very large images can
    # exhaust GPU memory even with a batch size of 1 -- confirm.
    features, labels = preprocessing.eval_input_fn(image_files, label_files)

    predictions = deeplab_model.deeplabv3_plus_model_fn(
        features,
        labels,
        tf.estimator.ModeKeys.EVAL,
        params={
            'output_stride': FLAGS.output_stride,
            'batch_size': 1,  # Batch size must be 1 because the images' size may differ
            'base_architecture': FLAGS.base_architecture,
            'pre_trained_model': None,
            'batch_norm_decay': None,
            'num_classes': _NUM_CLASSES,
            'freeze_batch_norm': True
        }).predictions

    def compute_mean_iou(total_cm):
        """Compute the mean intersection-over-union via the confusion matrix."""
        sum_over_row = np.sum(total_cm, axis=0).astype(float)
        sum_over_col = np.sum(total_cm, axis=1).astype(float)
        cm_diag = np.diagonal(total_cm).astype(float)
        # Union = axis-0 total + axis-1 total - diagonal (counted twice).
        denominator = sum_over_row + sum_over_col - cm_diag

        # The mean is only computed over classes that appear in the
        # label or prediction tensor. If the denominator is 0, we need to
        # ignore the class.
        num_valid_entries = np.sum((denominator != 0).astype(float))

        # If the value of the denominator is 0, set it to 1 to avoid
        # zero division.
        denominator = np.where(
            denominator > 0,
            denominator,
            np.ones_like(denominator))

        ious = cm_diag / denominator

        print('Intersection over Union for each class:')
        for i, iou in enumerate(ious):
            print(' class {}: {:.4f}'.format(i, iou))

        # If the number of valid entries is 0 (no classes) we return 0.
        # A plain conditional avoids evaluating the division by zero that
        # np.where would still compute (and warn about).
        if num_valid_entries > 0:
            m_iou = float(np.sum(ious) / num_valid_entries)
        else:
            m_iou = 0.0
        print('mean Intersection over Union: {:.4f}'.format(m_iou))

    def compute_accuracy(total_cm):
        """Compute the accuracy via the confusion matrix."""
        denominator = total_cm.sum().astype(float)
        cm_diag_sum = np.diagonal(total_cm).sum().astype(float)

        # If the confusion matrix is empty (no pixels counted) we return 0.
        if denominator > 0:
            accuracy = float(cm_diag_sum / denominator)
        else:
            accuracy = 0.0
        print('Pixel Accuracy: {:.4f}'.format(accuracy))

    # Pixel counts are accumulated over the whole evaluation set; int64 keeps
    # the totals from wrapping around on large data sets.
    sum_cm = np.zeros((_NUM_CLASSES, _NUM_CLASSES), dtype=np.int64)

    # Manually load the latest checkpoint
    saver = tf.train.Saver()
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt is None or not ckpt.model_checkpoint_path:
            # get_checkpoint_state returns None when the directory holds no
            # valid checkpoint; fail with an actionable message instead of
            # an AttributeError on None.
            raise ValueError(
                'No checkpoint found in model_dir: {}'.format(FLAGS.model_dir))
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Loop through the batches and accumulate the confusion matrix
        step = 1
        start = timeit.default_timer()
        while True:
            try:
                preds = sess.run(predictions)
            except tf.errors.OutOfRangeError:
                # The one-shot input pipeline is exhausted.
                break
            sum_cm += preds['confusion_matrix']
            if not step % 100:
                stop = timeit.default_timer()
                tf.logging.info("current step = {} ({:.3f} sec)".format(step, stop-start))
                start = timeit.default_timer()
            step += 1

    compute_mean_iou(sum_cm)
    compute_accuracy(sum_cm)
if __name__ == '__main__':
    # Show INFO-level messages so the periodic step timing is visible.
    tf.logging.set_verbosity(tf.logging.INFO)
    # Flags this script knows go into the module-level FLAGS read by main();
    # everything else is forwarded untouched to tf.app.run.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(argv=[sys.argv[0]] + unparsed, main=main)
"""Utility functions for preprocessing data sets."""
from PIL import Image
import numpy as np
import tensorflow as tf
# Per-channel (R, G, B) means used as defaults by the mean addition and
# subtraction helpers below.
_R_MEAN, _G_MEAN, _B_MEAN = 123.68, 116.78, 103.94

# Colour map: RGB colour used to render each class index.
label_colours = [
    (0, 0, 0),        # 0=background
    (128, 0, 0),      # 1=aeroplane
    (0, 128, 0),      # 2=bicycle
    (128, 128, 0),    # 3=bird
    (0, 0, 128),      # 4=boat
    (128, 0, 128),    # 5=bottle
    (0, 128, 128),    # 6=bus
    (128, 128, 128),  # 7=car
    (64, 0, 0),       # 8=cat
    (192, 0, 0),      # 9=chair
    (64, 128, 0),     # 10=cow
    (192, 128, 0),    # 11=dining table
    (64, 0, 128),     # 12=dog
    (192, 0, 128),    # 13=horse
    (64, 128, 128),   # 14=motorbike
    (192, 128, 128),  # 15=person
    (0, 64, 0),       # 16=potted plant
    (128, 64, 0),     # 17=sheep
    (0, 192, 0),      # 18=sofa
    (128, 192, 0),    # 19=train
    (0, 64, 128),     # 20=tv/monitor
]
def decode_labels(mask, num_images=1, num_classes=21):
    """Decode batch of segmentation masks into RGB images.

    Each class index found in channel 0 of `mask` is replaced by its colour
    from `label_colours`. Pixels whose index is not below `num_classes` are
    left black.

    Args:
      mask: 4-D integer array holding the result of inference after taking
        argmax; only channel 0 of the last axis is read.
      num_images: number of images to decode from the batch.
      num_classes: number of classes to predict (including background).

    Returns:
      A uint8 array of shape `[num_images, height, width, 3]` with the RGB
      rendering of the first `num_images` masks.
    """
    n, h, w, _ = mask.shape
    assert (n >= num_images), 'Batch size %d should be greater or equal than number of images to save %d.' \
        % (n, num_images)

    # Look colours up with a single vectorised palette index instead of a
    # per-pixel Python loop.
    palette = np.asarray(label_colours, dtype=np.uint8)
    outputs = np.zeros((num_images, h, w, 3), dtype=np.uint8)
    class_ids = mask[:num_images, :, :, 0]
    known = class_ids < num_classes
    outputs[known] = palette[class_ids[known]]
    return outputs
def mean_image_addition(image, means=(_R_MEAN, _G_MEAN, _B_MEAN)):
    """Adds the given means to each image channel.

    This undoes `mean_image_subtraction` when called with the same means.

    For example:
      means = [123.68, 116.779, 103.939]
      image = mean_image_addition(image, means)

    Note that the rank of `image` must be known.

    Args:
      image: a tensor of size [height, width, C].
      means: a C-vector of values to add to each channel.

    Returns:
      the image with the per-channel means added.

    Raises:
      ValueError: If the rank of `image` is unknown, if `image` has a rank other
        than three or if the number of channels in `image` doesn't match the
        number of values in `means`.
    """
    static_shape = image.get_shape()
    if static_shape.ndims != 3:
        raise ValueError('Input must be of size [height, width, C>0]')

    depth = static_shape.as_list()[-1]
    if len(means) != depth:
        raise ValueError('len(means) must match the number of channels')

    # Split into single-channel planes, shift each one, then stitch them back.
    planes = tf.split(axis=2, num_or_size_splits=depth, value=image)
    shifted = [plane + mean for plane, mean in zip(planes, means)]
    return tf.concat(axis=2, values=shifted)
def mean_image_subtraction(image, means=(_R_MEAN, _G_MEAN, _B_MEAN)):
    """Subtracts the given means from each image channel.

    For example:
      means = [123.68, 116.779, 103.939]
      image = mean_image_subtraction(image, means)

    Note that the rank of `image` must be known.

    Args:
      image: a tensor of size [height, width, C].
      means: a C-vector of values to subtract from each channel.

    Returns:
      the centered image.

    Raises:
      ValueError: If the rank of `image` is unknown, if `image` has a rank other
        than three or if the number of channels in `image` doesn't match the
        number of values in `means`.
    """
    static_shape = image.get_shape()
    if static_shape.ndims != 3:
        raise ValueError('Input must be of size [height, width, C>0]')

    depth = static_shape.as_list()[-1]
    if len(means) != depth:
        raise ValueError('len(means) must match the number of channels')

    # Split into single-channel planes, centre each one, then stitch them back.
    planes = tf.split(axis=2, num_or_size_splits=depth, value=image)
    centered = [plane - mean for plane, mean in zip(planes, means)]
    return tf.concat(axis=2, values=centered)
def random_rescale_image_and_label(image, label, min_scale, max_scale):
    """Rescale an image and its label by one random factor.

    A single scale is drawn uniformly from `[min_scale, max_scale)` and applied
    to both height and width, so the image and the label stay aligned.

    Args:
      image: 3-D Tensor of shape `[height, width, channels]`.
      label: 3-D Tensor of shape `[height, width, 1]`.
      min_scale: Min target scale.
      max_scale: Max target scale.

    Returns:
      A tuple `(image, label)`: the image resized bilinearly to
      `[new_height, new_width, channels]` and the label resized with
      nearest-neighbour interpolation to `[new_height, new_width, 1]`.

    Raises:
      ValueError: If either scale is not positive or if `min_scale` is not
        strictly smaller than `max_scale`.
    """
    # Guard clauses; the order decides which message wins when several apply.
    if min_scale <= 0:
        raise ValueError('\'min_scale\' must be greater than 0.')
    if max_scale <= 0:
        raise ValueError('\'max_scale\' must be greater than 0.')
    if min_scale >= max_scale:
        raise ValueError('\'max_scale\' must be greater than \'min_scale\'.')

    dims = tf.shape(image)
    height = tf.to_float(dims[0])
    width = tf.to_float(dims[1])
    factor = tf.random_uniform(
        [], minval=min_scale, maxval=max_scale, dtype=tf.float32)
    target_size = [tf.to_int32(height * factor), tf.to_int32(width * factor)]

    resized_image = tf.image.resize_images(
        image, target_size, method=tf.image.ResizeMethod.BILINEAR)
    # Since label classes are integers, nearest neighbor need to be used.
    resized_label = tf.image.resize_images(
        label, target_size, method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

    return resized_image, resized_label
def random_crop_or_pad_image_and_label(image, label, crop_height, crop_width, ignore_label):
    """Randomly crop an image/label pair, zero-padding first when too small.

    The image and label are stacked along the channel axis so both receive
    the same padding and the same random crop window.

    Args:
      image: 3-D Tensor of shape `[height, width, channels]`; the crop is
        taken with exactly 4 stacked channels, so 3 image channels are
        expected.
      label: 3-D Tensor of shape `[height, width, 1]`.
      crop_height: The new height.
      crop_width: The new width.
      ignore_label: Label class to be ignored.

    Returns:
      A tuple `(image_crop, label_crop)`: the first three channels of the
      crop as the image and the remaining channel, converted to int32, as
      the label.
    """
    # Shift labels so the zeros added by padding map to `ignore_label` once
    # the shift is undone below.
    shifted_label = tf.to_float(label - ignore_label)

    height = tf.shape(image)[0]
    width = tf.shape(image)[1]

    stacked = tf.concat([image, shifted_label], axis=2)
    padded = tf.image.pad_to_bounding_box(
        stacked, 0, 0,
        tf.maximum(crop_height, height),
        tf.maximum(crop_width, width))
    cropped = tf.random_crop(padded, [crop_height, crop_width, 4])

    image_crop = cropped[:, :, :3]
    label_crop = tf.to_int32(cropped[:, :, 3:] + ignore_label)
    return image_crop, label_crop
def random_flip_left_right_image_and_label(image, label):
    """Mirror an image and its label horizontally with probability 0.5.

    One random draw decides for both tensors, so they are either both
    flipped or both left untouched.

    Args:
      image: A 3-D tensor of shape `[height, width, channels].`
      label: A 3-D tensor of shape `[height, width, 1].`

    Returns:
      A 3-D tensor of the same type and shape as `image`.
      A 3-D tensor of the same type and shape as `label`.
    """
    draw = tf.random_uniform([], 0, 1.0)
    should_flip = tf.less(draw, .5)

    def _mirror(tensor):
        # Reverse along axis 1 (the width axis) only when the draw says so.
        return tf.cond(should_flip,
                       lambda: tf.reverse(tensor, [1]),
                       lambda: tensor)

    flipped_image = _mirror(image)
    flipped_label = _mirror(label)
    return flipped_image, flipped_label
def eval_input_fn(image_filenames, label_filenames=None, batch_size=1):
    """An input function for evaluation and inference.

    Every image file is read, decoded, converted to float and mean-centred
    with `mean_image_subtraction`. When label files are given, each one is
    decoded to an int32 tensor and paired with its image. No resizing is
    applied.

    Args:
      image_filenames: The file names for the inferred images.
      label_filenames: The file names for the ground truth labels, or None
        to produce images only.
      batch_size: The number of samples per batch. Need to be 1
        for the images of different sizes.

    Returns:
      A tuple of images and labels; labels is None when `label_filenames`
      is None.
    """
    has_labels = label_filenames is not None

    def _load_image(path):
        # Read the file and decode it into a dense, mean-centred float tensor.
        raw = tf.read_file(path)
        decoded = tf.image.decode_image(raw)
        pixels = tf.to_float(tf.image.convert_image_dtype(decoded, dtype=tf.uint8))
        pixels.set_shape([None, None, 3])
        return mean_image_subtraction(pixels)

    def _load_label(path):
        # Read the file and decode it into a single-channel int32 tensor.
        raw = tf.read_file(path)
        decoded = tf.image.decode_image(raw)
        classes = tf.to_int32(tf.image.convert_image_dtype(decoded, dtype=tf.uint8))
        classes.set_shape([None, None, 1])
        return classes

    def _load_pair(image_path, label_path):
        # The image ops are created before the label ops.
        pixels = _load_image(image_path)
        classes = _load_label(label_path)
        return pixels, classes

    if has_labels:
        dataset = tf.data.Dataset.from_tensor_slices((image_filenames, label_filenames))
        dataset = dataset.map(_load_pair)
    else:
        dataset = tf.data.Dataset.from_tensor_slices(image_filenames)
        dataset = dataset.map(_load_image)

    # Prefetch is applied before batching, so it buffers individual examples.
    dataset = dataset.prefetch(batch_size).batch(batch_size)
    iterator = dataset.make_one_shot_iterator()

    if has_labels:
        images, labels = iterator.get_next()
    else:
        images = iterator.get_next()
        labels = None

    return images, labels
2018-06-24 15:09:19.543117: I tensorflow/core/common_runtime/bfc_allocator.cc:680] Stats:
Limit: 11924940391
InUse: 5998758912
MaxInUse: 8327384064
NumAllocs: 7641
MaxAllocSize: 4246732800
2018-06-24 15:09:19.543188: W tensorflow/core/common_runtime/bfc_allocator.cc:279] ****_******______**___________________******************************************____________________
2018-06-24 15:09:19.543229: W tensorflow/core/framework/op_kernel.cc:1318] OP_REQUIRES failed at transpose_op.cc:199 : Resource exhausted: OOM when allocating tensor with shape[576,2048,22,25] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
Traceback (most recent call last):
File "evaluate.py", line 149, in <module>
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "evaluate.py", line 86, in main
preds = sess.run(predictions)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 900, in run
run_metadata_ptr)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1135, in _run
feed_dict_tensor, options, run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1316, in _do_run
run_metadata)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.py", line 1335, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.ResourceExhaustedError: OOM when allocating tensor with shape[1296,16,17,2048] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[Node: aspp/conv_3x3_3/SpaceToBatchND = SpaceToBatchND[T=DT_FLOAT, Tblock_shape=DT_INT32, Tpaddings=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](resnet_v2_101/block4/unit_3/bottleneck_v2/add-2-0-TransposeNCHWToNHWC-LayoutOptimizer, aspp/conv_3x3_3/SpaceToBatchND/block_shape, aspp/conv_3x3_3/strided_slice_2)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[Node: confusion_matrix/assert_non_negative_1/assert_less_equal/Assert/AssertGuard/Assert/Switch_1/_1375 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3800_...t/Switch_1", tensor_type=DT_INT64, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
Caused by op u'aspp/conv_3x3_3/SpaceToBatchND', defined at:
File "evaluate.py", line 149, in <module>
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 126, in run
_sys.exit(main(argv))
File "evaluate.py", line 71, in main
'freeze_batch_norm': True
File "/home/dmarmanis/deep_learning/semantic_segmentation/models/DeepLab3/deeplab/deeplab_model.py", line 172, in deeplabv3_plus_model_fn
logits = network(features, mode == tf.estimator.ModeKeys.TRAIN)
File "/home/dmarmanis/deep_learning/semantic_segmentation/models/DeepLab3/deeplab/deeplab_model.py", line 133, in model
encoder_output = atrous_spatial_pyramid_pooling(net, output_stride, batch_norm_decay, is_training)
File "/home/dmarmanis/deep_learning/semantic_segmentation/models/DeepLab3/deeplab/deeplab_model.py", line 51, in atrous_spatial_pyramid_pooling
conv_3x3_3 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[2], scope='conv_3x3_3')
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 183, in func_with_args
return func(*args, **current_args)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1049, in convolution
outputs = layer.apply(inputs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 828, in apply
return self.__call__(inputs, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/base.py", line 717, in __call__
outputs = self.call(inputs, *args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/layers/convolutional.py", line 168, in call
outputs = self._convolution_op(inputs, self.kernel)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 868, in __call__
return self.conv_op(inp, filter)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 520, in __call__
return self.call(inp, filter)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/nn_ops.py", line 503, in _with_space_to_batch_call
input=inp, block_shape=dilation_rate, paddings=paddings)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_array_ops.py", line 7486, in space_to_batch_nd
paddings=paddings, name=name)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
op_def=op_def)
File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[1296,16,17,2048] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
[[Node: aspp/conv_3x3_3/SpaceToBatchND = SpaceToBatchND[T=DT_FLOAT, Tblock_shape=DT_INT32, Tpaddings=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"](resnet_v2_101/block4/unit_3/bottleneck_v2/add-2-0-TransposeNCHWToNHWC-LayoutOptimizer, aspp/conv_3x3_3/SpaceToBatchND/block_shape, aspp/conv_3x3_3/strided_slice_2)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
[[Node: confusion_matrix/assert_non_negative_1/assert_less_equal/Assert/AssertGuard/Assert/Switch_1/_1375 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3800_...t/Switch_1", tensor_type=DT_INT64, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.