I am training a neural network to segment volumetric medical images. Each image is 192x192x160 voxels (one channel), and I train with a batch size of 2. My problem is that I randomly get a ResourceExhaustedError during training (sometimes after 4 epochs, sometimes after 10, and sometimes no error is raised at all). I am using Keras 2.1.1 with TensorFlow 1.3.0 on Ubuntu 16.04.5 LTS (Xenial Xerus). The code runs on an NVIDIA GeForce GTX 1080 Ti GPU.
In the code below, train_images is a numpy array of size 100x192x192x160, train_masks is a numpy array of size 100x192x192x160x3, val_images is a numpy array of size 30x192x192x160, and val_masks is a numpy array of size 30x192x192x160x3.
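As a back-of-the-envelope check, these four arrays alone take roughly 11.5 GiB of host memory, assuming float32 (the dtype is an assumption on my part; the shapes are as stated above):

import numpy as np

# Assumed dtype: float32, i.e. 4 bytes per element (not stated explicitly above).
bytes_per_elem = np.dtype(np.float32).itemsize
shapes = {
    'train_images': (100, 192, 192, 160),
    'train_masks':  (100, 192, 192, 160, 3),
    'val_images':   (30, 192, 192, 160),
    'val_masks':    (30, 192, 192, 160, 3),
}
for name, shape in shapes.items():
    gib = np.prod(shape) * bytes_per_elem / 2**30
    print('%-13s %-25s ~%.1f GiB' % (name, shape, gib))
# train_images ~2.2 GiB, train_masks ~6.6 GiB, val_images ~0.7 GiB, val_masks ~2.0 GiB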
How can I make sure the program does not stop with a ResourceExhaustedError? Don't hesitate to ask if you think any other information could help!
Before starting training, I tried killing all processes (PIDs) running on my GPU, as listed by the nvidia-smi command line tool.
history = model.fit(train_images,
                    train_masks,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=1,
                    shuffle=True,
                    validation_data=(val_images, val_masks))
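One mitigation I have seen suggested (a sketch only, I have not confirmed it fixes my case) is to let TensorFlow allocate GPU memory on demand instead of reserving it all up front, by configuring the session before the model is built:

import tensorflow as tf
from keras import backend as K

# Sketch: ask TensorFlow to grow GPU memory as needed rather than pre-allocating it.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
K.set_session(tf.Session(config=config))  # must run before building the model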
ResourceExhaustedError Traceback (most recent call last)
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1326 try:
-> 1327 return fn(*args)
1328 except errors.OpError as e:
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
1305 feed_dict, fetch_list, target_list,
-> 1306 status, run_metadata)
1307
/export/share/anaconda3/lib/python3.6/contextlib.py in __exit__(self, type, value, traceback)
87 try:
---> 88 next(self.gen)
89 except StopIteration:
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in raise_exception_on_not_ok_status()
465 compat.as_text(pywrap_tensorflow.TF_Message(status)),
--> 466 pywrap_tensorflow.TF_GetCode(status))
467 finally:
ResourceExhaustedError: OOM when allocating tensor with shape[2,32,192,192,160]
[[Node: training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2 = Conv3DBackpropInputV2[T=DT_FLOAT, _class=["loc:@conv3d_17/convolution"], data_format="NDHWC", padding="SAME", strides=[1, 1, 1, 1, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/conv3d_17/convolution_grad/Shape, conv3d_17/kernel/read, training/Adam/gradients/conv3d_17/add_grad/Reshape)]]
During handling of the above exception, another exception occurred:
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-9-6fa9876b44a1> in <module>()
----> 1 train(images[0:params['n_images'],:,:,:], masks[0:params['n_images'],:,:,:,:], images[100:130,:,:,:], masks[100:130,:,:,:,:], params)
<ipython-input-7-6118d16d7eba> in train(train_images, train_masks, val_images, val_masks, params)
78 verbose=1,
79 shuffle=True,
---> 80 validation_data = (val_images, val_masks))
81 #callbacks=[model_checkpoint, earlystopping])
82 model.save('results/' + params2name(params) + '/weights.h5')
/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1648 initial_epoch=initial_epoch,
1649 steps_per_epoch=steps_per_epoch,
-> 1650 validation_steps=validation_steps)
1651
1652 def evaluate(self, x=None, y=None,
/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
1211 batch_logs['size'] = len(batch_ids)
1212 callbacks.on_batch_begin(batch_index, batch_logs)
-> 1213 outs = f(ins_batch)
1214 if not isinstance(outs, list):
1215 outs = [outs]
/export/share/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
2350 session = get_session()
2351 updated = session.run(fetches=fetches, feed_dict=feed_dict,
-> 2352 **self.session_kwargs)
2353 return updated[:len(self.outputs)]
2354
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
893 try:
894 result = self._run(None, fetches, feed_dict, options_ptr,
--> 895 run_metadata_ptr)
896 if run_metadata:
897 proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
1122 if final_fetches or final_targets or (handle and feed_dict_tensor):
1123 results = self._do_run(handle, final_targets, final_fetches,
-> 1124 feed_dict_tensor, options, run_metadata)
1125 else:
1126 results = []
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
1319 if handle is None:
1320 return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1321 options, run_metadata)
1322 else:
1323 return self._do_call(_prun_fn, self._session, handle, feeds, fetches)
/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
1338 except KeyError:
1339 pass
-> 1340 raise type(e)(node_def, op, message)
1341
1342 def _extend_graph(self):
ResourceExhaustedError: OOM when allocating tensor with shape[2,32,192,192,160]
[[Node: training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2 = Conv3DBackpropInputV2[T=DT_FLOAT, _class=["loc:@conv3d_17/convolution"], data_format="NDHWC", padding="SAME", strides=[1, 1, 1, 1, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/conv3d_17/convolution_grad/Shape, conv3d_17/kernel/read, training/Adam/gradients/conv3d_17/add_grad/Reshape)]]
Caused by op 'training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2', defined at:
File "/export/share/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/export/share/anaconda3/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "/export/share/anaconda3/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
app.start()
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
ioloop.IOLoop.instance().start()
File "/export/share/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
handler_func(fd_obj, events)
File "/export/share/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/export/share/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
self._handle_recv()
File "/export/share/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
self._run_callback(callback, msg)
File "/export/share/anaconda3/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
callback(*args, **kwargs)
File "/export/share/anaconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
return fn(*args, **kwargs)
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
return self.dispatch_shell(stream, msg)
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
handler(stream, idents, msg)
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
user_expressions, allow_stdin)
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "/export/share/anaconda3/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
File "/export/share/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
interactivity=interactivity, compiler=compiler, result=result)
File "/export/share/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2808, in run_ast_nodes
if self.run_code(code, result):
File "/export/share/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "<ipython-input-9-6fa9876b44a1>", line 1, in <module>
train(images[0:params['n_images'],:,:,:], masks[0:params['n_images'],:,:,:,:], images[100:130,:,:,:], masks[100:130,:,:,:,:], params)
File "<ipython-input-7-6118d16d7eba>", line 80, in train
validation_data = (val_images, val_masks))
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1627, in fit
self._make_train_function()
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 990, in _make_train_function
loss=self.total_loss)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 87, in wrapper
return func(*args, **kwargs)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/optimizers.py", line 415, in get_updates
grads = self.get_gradients(loss, params)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/optimizers.py", line 73, in get_gradients
grads = K.gradients(loss, params)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 2389, in gradients
return tf.gradients(loss, variables, colocate_gradients_with_ops=True)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in gradients
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 348, in _MaybeCompile
return grad_fn() # Exit early
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gradients_impl.py", line 542, in <lambda>
grad_scope, op, func_call, lambda: grad_fn(op, *out_grads))
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_grad.py", line 80, in _Conv3DGrad
data_format=data_format),
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 664, in conv3d_backprop_input_v2
name=name)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
...which was originally created as op 'conv3d_17/convolution', defined at:
File "/export/share/anaconda3/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
[elided 18 identical lines from previous traceback]
File "<ipython-input-9-6fa9876b44a1>", line 1, in <module>
train(images[0:params['n_images'],:,:,:], masks[0:params['n_images'],:,:,:,:], images[100:130,:,:,:], masks[100:130,:,:,:,:], params)
File "<ipython-input-7-6118d16d7eba>", line 54, in train
model = unet(params,imsz)
File "/DATA/jeaneliott/bladderectum/utils.py", line 115, in unet
conv9 = Conv3D(params['n_feat_maps'], (3, 3, 3), activation='relu', padding='same')(up9)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/engine/topology.py", line 603, in __call__
output = self.call(inputs, **kwargs)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/layers/convolutional.py", line 172, in call
dilation_rate=self.dilation_rate)
File "/export/share/anaconda3/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 3364, in conv3d
data_format=tf_data_format)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 672, in convolution
op=op)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 338, in with_space_to_batch
return op(input, num_spatial_dims, padding)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 664, in op
name=name)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/nn_ops.py", line 146, in _non_atrous_convolution
name=name)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/gen_nn_ops.py", line 524, in conv3d
data_format=data_format, name=name)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
op_def=op_def)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2630, in create_op
original_op=self._default_original_op, op_def=op_def)
File "/export/share/anaconda3/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1204, in __init__
self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[2,32,192,192,160]
[[Node: training/Adam/gradients/conv3d_17/convolution_grad/Conv3DBackpropInputV2 = Conv3DBackpropInputV2[T=DT_FLOAT, _class=["loc:@conv3d_17/convolution"], data_format="NDHWC", padding="SAME", strides=[1, 1, 1, 1, 1], _device="/job:localhost/replica:0/task:0/gpu:0"](training/Adam/gradients/conv3d_17/convolution_grad/Shape, conv3d_17/kernel/read, training/Adam/gradients/conv3d_17/add_grad/Reshape)]]
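For reference, the tensor the allocator fails on, shape [2, 32, 192, 192, 160] with T=DT_FLOAT (4 bytes per element), is about 1.4 GiB on its own, so whether it fits presumably depends on what else happens to be resident on the 11 GB card at that moment:

# Size of the single tensor named in the OOM message above.
n_elements = 2 * 32 * 192 * 192 * 160   # 377,487,360 elements
n_bytes = n_elements * 4                # DT_FLOAT: 4 bytes per element
print('%.2f GiB' % (n_bytes / 2**30))   # ~1.41 GiB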