colab上的mxnet-gluon:cudaMalloc重试失败:内存不足

时间:2020-04-20 06:53:55

标签: google-colaboratory mxnet gluon faster-rcnn

我正在尝试使用 MXNet Gluon 和 VOC 数据集在 Google Colab 上训练 Faster R-CNN。训练完网络后,当我尝试用一张新图像进行测试时,出现了以下错误。无论 epoch 数、批次数上限(batchesLimit)还是批大小如何设置,这个问题都会出现。

This is the example I'm following

This is a full copy of what I've done so far

这是训练器(Trainer)的定义:

# Gluon Trainer over all parameters of `net`, using the 'sgd' optimizer with
# an initial learning rate of 15, wd 0.05 and momentum 0.9.
# The training loop calls trainer.set_learning_rate(...) before every step,
# so the 'learning_rate' given here is replaced before the first update.
# NOTE(review): both this rate and the one set in the loop look very large
# for SGD -- confirm they are intentional.
trainer = gluon.Trainer(net.collect_params(), 'sgd',{'learning_rate': 15, 'wd': 0.05, 'momentum': 0.9})

这是训练循环:

# Training loop: for each epoch, walk the batches from train_loader and run
# one forward/backward/update pass per sample on GPU 0.
# Hybridization is left disabled:
#net.hybridize()
for epoch in range(epochs):
    # Timer is defined elsewhere; presumably it reports the time spent per epoch.
    with Timer("epochTime"):
        print("epoch = ", epoch, "----------------------------------------------------")
        for ib, batch in enumerate(train_loader):
            # Stops once ib exceeds batchesLimit, i.e. at most batchesLimit + 1
            # batches are processed per epoch.
            if ib > batchesLimit:
                break
            # zip(*batch) walks the five batch components in lockstep, so each
            # iteration handles a single sample and its targets.
            for dataa, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(*batch):

                # Copy the sample and its targets to GPU 0; the label also gets
                # a leading axis via expand_dims(0).
                dataa = dataa.as_in_context(mx.gpu(0))
                label = label.as_in_context(mx.gpu(0)).expand_dims(0)
                rpn_cls_targets = rpn_cls_targets.as_in_context(mx.gpu(0))
                rpn_box_targets = rpn_box_targets.as_in_context(mx.gpu(0))
                rpn_box_masks = rpn_box_masks.as_in_context(mx.gpu(0))

                # Split the last axis of the label: index 4 is kept as gt_label,
                # indices 0-3 as gt_box (presumably class id and box corners --
                # TODO confirm against the dataset transform).
                gt_label = label[:, :, 4:5]
                gt_box = label[:, :, :4]

                with autograd.record():
                    # network forward
                    cls_preds, box_preds, roi, samples, matches, rpn_score, rpn_box, anchors, cls_targets, box_targets, box_masks, _ = net(dataa.expand_dims(0), gt_box, gt_label)

                    # losses of rpn (region proposal network)
                    # Both RPN losses are rescaled by the number of targets
                    # that are >= 0 (the same >= 0 mask is passed to the
                    # classification loss).
                    rpn_score = rpn_score.squeeze(axis=-1)
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(rpn_score, rpn_cls_targets,rpn_cls_targets >= 0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(rpn_box, rpn_box_targets,rpn_box_masks) * rpn_box.size / num_rpn_pos

                    # losses of rcnn (region-based convolutional neural network)
                    # Same rescaling idea, using the count of cls_targets >= 0
                    # and additionally dividing by the size of axis 0.
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(cls_preds, cls_targets,cls_targets >= 0) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(box_preds, box_targets, box_masks) * box_preds.size / box_preds.shape[0] / num_rcnn_pos

                # Backpropagation and parameter update
                autograd.backward([rpn_loss1, rpn_loss2, rcnn_loss1, rcnn_loss2])
                # Learning rate is reset before every step to
                # 100 / (epoch + 1) ** (4/3), i.e. 100 during the first epoch.
                # NOTE(review): this is an extremely large rate for SGD and may
                # make the weights diverge -- confirm it is intended.
                trainer.set_learning_rate(100/((epoch+1)**(4/3)))
                # step() runs once per sample (inside the inner loop) but is
                # given batch_size, which is defined elsewhere.
                # NOTE(review): confirm this normalization is intended.
                trainer.step(batch_size)

这是我加载待测试图像的方式(我已经在另一个 notebook 中用预训练版本的网络测试过这个单元格,一切正常):

# Inference cell: load one new image, run the trained network on it and
# plot the predicted boxes.

# Define the transform
short, max_size = 600, 800  # resize so the short side is 600 px while keeping the long side within max_size (800)
# NOTE(review): RCNN_transform is not used in the lines shown here; the image
# below is loaded with load_test instead.
RCNN_transform = presets.rcnn.FasterRCNNDefaultTrainTransform(short, max_size)

# Load a completely new image
miImg = 'gdrive/My Drive/Practica J PABLO FUENZALIDA/bar.jpg'
x, img = data.transforms.presets.rcnn.load_test(miImg)
# Move the network input to GPU 0; the image used for plotting is copied
# into an NDArray on `ctx`, which is defined elsewhere.
x=x.as_in_context(mx.gpu(0))
img = mx.nd.array(img,ctx=ctx)

# Forward pass without ground-truth arguments; the three outputs are unpacked
# as class ids, scores and boxes.
box_ids, scores, bboxes = net(x)

# NOTE(review): MXNet runs operators asynchronously, so an out-of-memory error
# raised during the forward pass above would presumably only surface at the
# first blocking call (the asnumpy() conversion inside plot_bbox) -- confirm.
ax = utils.viz.plot_bbox(img, bboxes[0], scores[0], box_ids[0], thresh=0.3, class_names=net.classes)
print(bboxes[0])
print(scores[0])

plt.show()

我应该如何解决这个错误?有哪些值得遵循的最佳实践?

---------------------------------------------------------------------------
MXNetError                                Traceback (most recent call last)
<ipython-input-23-bfcca7cbcfca> in <module>()
     10 box_ids, scores, bboxes = net(x)
     11 
---> 12 ax = utils.viz.plot_bbox(img, bboxes[0], scores[0], box_ids[0], thresh=0.3, class_names=net.classes)
     13 
     14 plt.show()

2 frames
/usr/local/lib/python3.6/dist-packages/gluoncv/utils/viz/bbox.py in plot_bbox(img, bboxes, scores, labels, thresh, class_names, colors, ax, reverse_rgb, absolute_coordinates)
     59 
     60     if isinstance(bboxes, mx.nd.NDArray):
---> 61         bboxes = bboxes.asnumpy()
     62     if isinstance(labels, mx.nd.NDArray):
     63         labels = labels.asnumpy()

/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in asnumpy(self)
   2533             self.handle,
   2534             data.ctypes.data_as(ctypes.c_void_p),
-> 2535             ctypes.c_size_t(data.size)))
   2536         return data
   2537 

/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
    253     """
    254     if ret != 0:
--> 255         raise MXNetError(py_str(_LIB.MXGetLastError()))
    256 
    257 

MXNetError: [18:30:25] src/storage/./pooled_storage_manager.h:161: cudaMalloc retry failed: out of memory
Stack trace:
  [bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x6d554b) [0x7ff0fc97e54b]
  [bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a0c72) [0x7ff100449c72]
  [bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x41a694f) [0x7ff10044f94f]
  [bt] (3) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x3972e10) [0x7ff0ffc1be10]
  [bt] (4) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x39730c7) [0x7ff0ffc1c0c7]
  [bt] (5) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::imperative::PushFCompute(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<unsigned int, std::allocator<unsigned int> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x281) [0x7ff0ffc1c4d1]
  [bt] (6) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38970cb) [0x7ff0ffb400cb]
  [bt] (7) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a3c31) [0x7ff0ffb4cc31]
  [bt] (8) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x38a7170) [0x7ff0ffb50170]

0 个答案:

没有答案