I am using an AWS p2.8xlarge instance and trying to evaluate my model with k-fold cross-validation. After the first fold, GPU memory is full, and when I try to train again I get a CUDA out-of-memory error.
My question is: how can I reset GPU memory inside the loop? I have tried K.clear_session() and gc.collect(), but neither works.
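For context, the loop looks roughly like this (simplified; build_model and make_generators stand in for my actual model and data code):

import gc
from keras import backend as K

for train_idx, val_idx in kfold_splits:                      # k-fold index pairs
    model = build_model()                                     # placeholder: my model definition
    train_gen, val_gen = make_generators(train_idx, val_idx)  # placeholder: my data generators
    model.fit_generator(generator=train_gen, samples_per_epoch=batches.n,
                        nb_epoch=epochs, verbose=True,
                        validation_data=val_gen, nb_val_samples=val_batches.n,
                        callbacks=callbacks)
    K.clear_session()   # tried this -- GPU memory is not released
    gc.collect()        # tried this too -- no effect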
Error message:
> MXNetError Traceback (most recent call
> last) ~/anaconda3/lib/python3.6/site-packages/mxnet/symbol.py in
> simple_bind(self, ctx, grad_req, type_dict, group2ctx,
> shared_arg_names, shared_exec, shared_buffer, **kwargs) 1472
> shared_exec_handle,
> -> 1473 ctypes.byref(exe_handle))) 1474 except MXNetError as e:
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/base.py in
> check_call(ret)
> 128 if ret != 0:
> --> 129 raise MXNetError(py_str(_LIB.MXGetLastError()))
> 130
>
> MXNetError: [19:24:04] src/storage/./pooled_storage_manager.h:102:
> cudaMalloc failed: out of memory
>
> Stack trace returned 10 entries: [bt] (0)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1d57cc)
> [0x7f55ce9fe7cc] [bt] (1)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1242238)
> [0x7f55cfa6b238] [bt] (2)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1244c0a)
> [0x7f55cfa6dc0a] [bt] (3)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe4d4db)
> [0x7f55cf6764db] [bt] (4)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe549cd)
> [0x7f55cf67d9cd] [bt] (5)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe59f95)
> [0x7f55cf682f95] [bt] (6)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5d6ee)
> [0x7f55cf6866ee] [bt] (7)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5dcd4)
> [0x7f55cf686cd4] [bt] (8)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2261)
> [0x7f55cf605291] [bt] (9)
> /home/ubuntu/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c)
> [0x7f560d6c4ec0]
>
>
> During handling of the above exception, another exception occurred:
>
> RuntimeError Traceback (most recent call
> last) <ipython-input-4-0720b69f15af> in <module>()
> 33 if val_batches.n>0:
> 34 hist = model.fit_generator(generator=train_gen, samples_per_epoch=batches.n,
> ---> 35 nb_epoch=epochs, verbose=True, validation_data=val_gen, nb_val_samples=val_batches.n,
> callbacks=callbacks)
> 36 else:
> 37 model.fit_generator(generator=train_gen, samples_per_epoch=batches.n,
>
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py
> in fit_generator(self, generator, samples_per_epoch, nb_epoch,
> verbose, callbacks, validation_data, nb_val_samples, class_weight,
> max_q_size, nb_worker, pickle_safe, initial_epoch) 1557
> outs = self.train_on_batch(x, y, 1558
> sample_weight=sample_weight,
> -> 1559 class_weight=class_weight) 1560 1561 if not
> isinstance(outs, list):
>
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py
> in train_on_batch(self, x, y, sample_weight, class_weight) 1320
> ins = x + y + sample_weights 1321
> self._make_train_function()
> -> 1322 outputs = self.train_function(ins) 1323 if len(outputs) == 1: 1324 return outputs[0]
>
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py
> in train_function(inputs) 1952 def
> _make_train_function(self): 1953 def train_function(inputs):
> -> 1954 data, label, _, data_shapes, label_shapes = self._adjust_module(inputs, 'train') 1955 1956
> batch = K.mx.io.DataBatch(data=data, label=label, bucket_key='train',
>
> ~/anaconda3/lib/python3.6/site-packages/Keras-1.2.2-py3.6.egg/keras/engine/training.py
> in _adjust_module(self, inputs, phase) 1908 if not
> self._mod.binded: 1909
> self._mod.bind(data_shapes=data_shapes, label_shapes=None,
> -> 1910 for_training=True) 1911 self._set_weights() 1912
> self._mod.init_optimizer(kvstore=self._kvstore,
> optimizer=self.optimizer)
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/bucketing_module.py
> in bind(self, data_shapes, label_shapes, for_training,
> inputs_need_grad, force_rebind, shared_module, grad_req)
> 322 state_names=self._state_names)
> 323 module.bind(data_shapes, label_shapes, for_training, inputs_need_grad,
> --> 324 force_rebind=False, shared_module=None, grad_req=grad_req)
> 325 self._curr_module = module
> 326 self._curr_bucket_key = self._default_bucket_key
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py in
> bind(self, data_shapes, label_shapes, for_training, inputs_need_grad,
> force_rebind, shared_module, grad_req)
> 415 fixed_param_names=self._fixed_param_names,
> 416 grad_req=grad_req,
> --> 417 state_names=self._state_names)
> 418 self._total_exec_bytes = self._exec_group._total_exec_bytes
> 419 if shared_module is not None:
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py
> in __init__(self, symbol, contexts, workload, data_shapes,
> label_shapes, param_names, for_training, inputs_need_grad,
> shared_group, logger, fixed_param_names, grad_req, state_names)
> 229 self.num_outputs = len(self.symbol.list_outputs())
> 230
> --> 231 self.bind_exec(data_shapes, label_shapes, shared_group)
> 232
> 233 def decide_slices(self, data_shapes):
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py
> in bind_exec(self, data_shapes, label_shapes, shared_group, reshape)
> 325 else:
> 326 self.execs.append(self._bind_ith_exec(i, data_shapes_i, label_shapes_i,
> --> 327 shared_group))
> 328
> 329 self.data_shapes = data_shapes
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/module/executor_group.py
> in _bind_ith_exec(self, i, data_shapes, label_shapes, shared_group)
> 601 type_dict=input_types, shared_arg_names=self.param_names,
> 602 shared_exec=shared_exec,
> --> 603 shared_buffer=shared_data_arrays, **input_shapes)
> 604 self._total_exec_bytes += int(executor.debug_str().split('\n')[-3].split()[1])
> 605 return executor
>
> ~/anaconda3/lib/python3.6/site-packages/mxnet/symbol.py in
> simple_bind(self, ctx, grad_req, type_dict, group2ctx,
> shared_arg_names, shared_exec, shared_buffer, **kwargs) 1477
> error_msg += "%s: %s\n" % (k, v) 1478 error_msg += "%s"
> % e
> -> 1479 raise RuntimeError(error_msg) 1480 1481 # update shared_buffer
>
> RuntimeError: simple_bind error. Arguments: input_1_1: (64, 3, 224,
> 224) [19:24:04] src/storage/./pooled_storage_manager.h:102: cudaMalloc
> failed: out of memory
>
> Stack trace returned 10 entries: [bt] (0)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1d57cc)
> [0x7f55ce9fe7cc] [bt] (1)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1242238)
> [0x7f55cfa6b238] [bt] (2)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x1244c0a)
> [0x7f55cfa6dc0a] [bt] (3)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe4d4db)
> [0x7f55cf6764db] [bt] (4)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe549cd)
> [0x7f55cf67d9cd] [bt] (5)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe59f95)
> [0x7f55cf682f95] [bt] (6)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5d6ee)
> [0x7f55cf6866ee] [bt] (7)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(+0xe5dcd4)
> [0x7f55cf686cd4] [bt] (8)
> /home/ubuntu/anaconda3/lib/python3.6/site-packages/mxnet/libmxnet.so(MXExecutorSimpleBind+0x2261)
> [0x7f55cf605291] [bt] (9)
> /home/ubuntu/anaconda3/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c)
> [0x7f560d6c4ec0]
Answer (score: 1)
Use gc.collect()
I was able to limit the GPU memory footprint to twice that of a single run; without it, the footprint kept growing. I created a single function for training and evaluation, which lets gc clear the model after the result has been returned.
import gc

cv_results = []
for train, test in cv_folds:
    result = train_and_eval(train, test)  # the model lives entirely inside this function
    cv_results.append(result)
    gc.collect()  # reclaim the previous fold's model (and its GPU memory)
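The important part is that the model is built and used entirely inside that function, so the last reference to it disappears when the function returns. A rough sketch of what train_and_eval might look like (build_model and make_generators are placeholders for your own code; I'm assuming Keras DirectoryIterator-style generators with an .n attribute, as in the question, and that epochs is defined):

def train_and_eval(train_idx, test_idx):
    # Everything that touches the model stays inside this function, so the
    # only reference to it is dropped on return and gc.collect() in the
    # loop can reclaim it.
    model = build_model()                                       # placeholder: your model definition
    train_gen, val_gen = make_generators(train_idx, test_idx)   # placeholder: your data pipeline
    model.fit_generator(generator=train_gen, samples_per_epoch=train_gen.n,
                        nb_epoch=epochs, validation_data=val_gen,
                        nb_val_samples=val_gen.n)
    return model.evaluate_generator(val_gen, val_samples=val_gen.n)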
Since the footprint is still twice that of a single run, you should consider reducing your batch size to compensate. You should then be able to fit everything into GPU memory.
It's worth noting that MXNet doesn't actually release the memory back from the GPU; instead it returns it to an internal memory pool for future use. So although GPU memory usage still shows as high in nvidia-smi, that memory is still available to MXNet. As with most computation steps, garbage collection of this memory happens asynchronously.
If you can't keep two runs' worth of memory on your GPU, you can always launch a subprocess, as geoalgo mentioned, using something like this:
from subprocess import Popen, PIPE
import json

def eval_on_fold(indices):
    indices_str = json.dumps(indices)
    # Run training in a child process; its GPU memory is released when it exits.
    p = Popen(['python', 'train_and_eval.py', '--cv-indices'],
              stdout=PIPE, stdin=PIPE, stderr=PIPE, universal_newlines=True)
    eval_metric_str = p.communicate(input=indices_str)[0]
    return float(eval_metric_str)

cv_results = []
for train, test in cv_folds:
    indices = {'train': train, 'test': test}
    eval_metric = eval_on_fold(indices)
    cv_results.append(eval_metric)
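For this to work, train_and_eval.py has to read the JSON-encoded indices from stdin (that is where communicate() sends them; the --cv-indices flag above is only a marker) and print nothing but the metric to stdout, since the parent parses stdout with float(). A rough sketch of such a script, reusing a train_and_eval function like the one described earlier (the import is a placeholder for wherever your own training/evaluation code lives):

# train_and_eval.py -- sketch only
import sys
import json

from my_training_code import train_and_eval   # placeholder: your own training/eval function

if __name__ == '__main__':
    indices = json.loads(sys.stdin.read())     # {'train': [...], 'test': [...]}
    metric = train_and_eval(indices['train'], indices['test'])
    print(metric)                              # stdout must contain only the metric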