I am trying to train GluonCV's FasterRCNN on a custom dataset. However, while loading data into the model I run into errors that prevent me from training it.
I adapted the complete training script from the GluonCV fine-tuning tutorial [1], referring to the complete Faster RCNN training script [2]. Using the same approach with the YoloV3 training script [3], I have previously adapted and successfully trained both SSD and YoloV3. Since these scripts are designed for multi-GPU training, I simplified all three into single-GPU scripts run from a Jupyter Notebook.
However, Faster RCNN fails when I use the batchify function from the tutorial:
train_bfn = batchify.Tuple(*[batchify.Append() for _ in range(5)])
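The 5 in range(5) is my reading of the transform's output: when a network is passed in, FasterRCNNDefaultTrainTransform should return 5 elements per sample (image, box labels and the RPN targets), hence one Append per slot. A minimal sketch of that assumption:

# my assumption: with a net, the train transform yields 5 elements per sample,
# roughly (img, bbox, rpn_cls_targets, rpn_box_targets, rpn_box_masks)
sample = train_dataset.transform(FasterRCNNDefaultTrainTransform(
    net.short, net.max_size, net, ashape=net.ashape, multi_stage=True))[0]
print(len(sample))  # expected to print 5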
I looked for an alternative and came across another batchify function in the GluonCV repository, but it fails as well:
train_bfn = batchify.FasterRCNNTrainBatchify(net)
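For what it's worth, the repo's training script builds this batchify function with the number of devices; my single-GPU variant (assuming num_shards is the right keyword argument, as it appears in the GluonCV source) was:

# single-GPU sketch; num_shards=1 is my assumption based on the GluonCV source
train_bfn = batchify.FasterRCNNTrainBatchify(net, num_shards=1)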
[1] https://gluon-cv.mxnet.io/build/examples_detection/finetune_detection.html
[2] https://gluon-cv.mxnet.io/build/examples_detection/train_faster_rcnn_voc.html#data-loader
[3] https://gluon-cv.mxnet.io/build/examples_detection/train_yolo_v3.html
The data loader function is as follows:
def get_dataloader(net, train_dataset, batch_size, num_workers):
    train_bfn = batchify.Tuple(*[batchify.Append() for _ in range(5)])
    # train_bfn = batchify.FasterRCNNTrainBatchify(net)  # the batchify fn found in the GluonCV repo; it fails too
    # adding and removing a train sampler didn't fix the loading problem
    # train_sampler = gcv.nn.sampler.SplitSampler(len(train_dataset), 1)
    train_loader = mx.gluon.data.DataLoader(
        train_dataset.transform(FasterRCNNDefaultTrainTransform(
            net.short, net.max_size, net, ashape=net.ashape, multi_stage=True)),
        batch_size, shuffle=False, batchify_fn=train_bfn,
        last_batch='rollover', num_workers=num_workers)
    return train_loader
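For completeness, this is roughly how the loader is created in my notebook; the batch size and worker count below are placeholders, and train_batcher is the name used in the training loop further down:

# placeholder values; in the notebook these come from an earlier config cell
train_batcher = get_dataloader(net, train_dataset, batch_size=1, num_workers=0)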
The main training loop is defined below. Apart from removing the script arguments and the multi-GPU handling, it largely follows the original tutorial:
# unusual loading method for Faster-RCNN
def split_and_load(batch, ctx_list):
    """Split data to 1 batch each device."""
    new_batch = []
    for i, data in enumerate(batch):
        new_data = [x.as_in_context(ctx) for x, ctx in zip(data, ctx_list)]
        new_batch.append(new_data)
    return new_batch
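Since batchify.Append returns a list of per-image NDArrays for each tuple slot, split_and_load expects every batch element to be such a list. A minimal inspection sketch (in my case this triggers the same batchify error, but on a working setup it shows the structure split_and_load consumes):

# sketch: peek at one batch to check the per-element structure
batch = next(iter(train_batcher))
for i, elem in enumerate(batch):
    print(i, type(elem), [getattr(x, 'shape', None) for x in elem])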
print(f"Starting Training @ {start_epoch+1}/{end_epoch} Epochs. Reporting loss every {batch_reporter} batch")
start_time = time.time()
for epoch in range(start_epoch, end_epoch):
rcnn_task = ForwardBackwardTask(net, trainer, rpn_cls_loss, rpn_box_loss, rcnn_cls_loss, rcnn_box_loss)
executor = Parallel(1, rcnn_task)
#metrics
for metric in rcnn_losses:
metric.reset()
tic = time.time()
btic = time.time()
#timers
tic = time.time() #currently unused
btic = time.time() #batch time
#setup the network into static computation graph for faster computation
net.hybridize(static_alloc=True, static_shape=True)
#main training loop, batch
for i, batch in enumerate(train_batcher):
#get batch size
batch = split_and_load(batch, ctx_list=ctx)
batch_size = len(batch[0])
batch_losses = [[] for _ in rcnn_losses]
batch_metrics = [[] for _ in rcnn_metrics]
#load the gpu context into the data, class targets and box targets
for data in zip(*batch):
executor.put(data)
#iteratte over the contexts
for j in range(len(ctx)):
result = executor.get()
for k in range(len(metric_losses)):
batch_losses[k].append(result[k])
for k in range(len(add_losses)):
batch_mertrics[k].append(result[len(batch_losses) + k])
for metric, record in zip(rcnn_losses, batch_losses):
metric.update(0, record)
for metric, records in zip(rcnn_metrics, batch_metrics):
for pred in records:
metric.update(pred[0], pred[1])
#update traininer
trainer.step(batch_size)
if i % batch_reporter == 0:
msg = ','.join(
['{}={:.3f}'.format(*metric.get()) for metric in rcnn_losses + rcnn_metrics])
logger.info('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.format(
epoch, i, batch_size / (time.time() - btic), msg))
btic = time.time()
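For context on the second error below: my ForwardBackwardTask is adapted from the tutorial, and the fragment the second traceback points at is the label slicing at the start of forward_backward (the unpacking line follows the tutorial; the rest is quoted in the traceback):

def forward_backward(self, x):
    # unpacking follows the tutorial's ForwardBackwardTask
    data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks = x
    with autograd.record():
        gt_label = label[:, :, 4:5]
        gt_box = label[:, :, :4]
        cls_pred, box_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
            data, gt_box)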
The full error when using the following batchify function is listed below:
train_bfn = batchify.Tuple(*[batchify.Append() for _ in range(5)])
Starting Training @ 1/100 Epochs. Reporting loss every 20 batch
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in array(source_array, ctx, dtype)
2500 try:
-> 2501 source_array = np.array(source_array, dtype=dtype)
2502 except:
ValueError: setting an array element with a sequence.
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
7 frames
<ipython-input-17-7873cb0811cc> in <module>()
50 net.hybridize(static_alloc=True, static_shape=True)
51 #main training loop, batch
---> 52 for i, batch in enumerate(train_batcher):
53 #get batch size
54 batch = split_and_load(batch, ctx_list=ctx)
/usr/local/lib/python3.6/dist-packages/mxnet/gluon/data/dataloader.py in same_process_iter()
573 def same_process_iter():
574 for batch in self._batch_sampler:
--> 575 ret = self._batchify_fn([self._dataset[idx] for idx in batch])
576 if self._pin_memory:
577 ret = _as_in_context(ret, context.cpu_pinned(self._pin_device_id))
/usr/local/lib/python3.6/dist-packages/gluoncv/data/batchify.py in __call__(self, data)
378 ret = []
379 for i, ele_fn in enumerate(self._fn):
--> 380 ret.append(ele_fn([ele[i] for ele in data]))
381 return tuple(ret)
382
/usr/local/lib/python3.6/dist-packages/gluoncv/data/batchify.py in __call__(self, data)
294 """
295 return _append_arrs(data, use_shared_mem=True,
--> 296 expand=self._expand, batch_axis=self._batch_axis)
297
298
/usr/local/lib/python3.6/dist-packages/gluoncv/data/batchify.py in _append_arrs(arrs, use_shared_mem, expand, batch_axis)
92 else:
93 if use_shared_mem:
---> 94 out = [mx.nd.array(x, ctx=mx.Context('cpu_shared', 0)) for x in arrs]
95 else:
96 out = [mx.nd.array(x) for x in arrs]
/usr/local/lib/python3.6/dist-packages/gluoncv/data/batchify.py in <listcomp>(.0)
92 else:
93 if use_shared_mem:
---> 94 out = [mx.nd.array(x, ctx=mx.Context('cpu_shared', 0)) for x in arrs]
95 else:
96 out = [mx.nd.array(x) for x in arrs]
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/utils.py in array(source_array, ctx, dtype)
144 return _sparse_array(source_array, ctx=ctx, dtype=dtype)
145 else:
--> 146 return _array(source_array, ctx=ctx, dtype=dtype)
147
148
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in array(source_array, ctx, dtype)
2501 source_array = np.array(source_array, dtype=dtype)
2502 except:
-> 2503 raise TypeError('source_array must be array like object')
2504 arr = empty(source_array.shape, ctx, dtype)
2505 arr[:] = source_array
TypeError: source_array must be array like object
The following full error occurs when using:
train_bfn = batchify.FasterRCNNTrainBatchify(net)
Starting Training @ 1/100 Epochs. Reporting loss every 20 batch
---------------------------------------------------------------------------
MXNetError Traceback (most recent call last)
<ipython-input-48-7873cb0811cc> in <module>()
58 #load the gpu context into the data, class targets and box targets
59 for data in zip(*batch):
---> 60 executor.put(data)
61 #iteratte over the contexts
62 for j in range(len(ctx)):
6 frames
/usr/local/lib/python3.6/dist-packages/gluoncv/utils/parallel.py in put(self, x)
117 if self._num_serial > 0 or len(self._threads) == 0:
118 self._num_serial -= 1
--> 119 out = self._parallizable.forward_backward(x)
120 self._out_queue.put(out)
121 else:
<ipython-input-47-9cc4d320d600> in forward_backward(self, x)
17
18 with autograd.record():
---> 19 gt_label = label[:, :, 4:5]
20 gt_box = label[:, :, :4]
21 cls_pred, box_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in __getitem__(self, key)
509 indexing_dispatch_code = _get_indexing_dispatch_code(key)
510 if indexing_dispatch_code == _NDARRAY_BASIC_INDEXING:
--> 511 return self._get_nd_basic_indexing(key)
512 elif indexing_dispatch_code == _NDARRAY_ADVANCED_INDEXING:
513 return self._get_nd_advanced_indexing(key)
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/ndarray.py in _get_nd_basic_indexing(self, key)
821 'index=%s of type=%s.' % (str(slice_i), str(type(slice_i))))
822 kept_axes.extend(range(i+1, len(shape)))
--> 823 sliced_nd = op.slice(self, begin, end, step)
824 if len(kept_axes) == len(shape):
825 return sliced_nd
/usr/local/lib/python3.6/dist-packages/mxnet/ndarray/register.py in slice(data, begin, end, step, out, name, **kwargs)
/usr/local/lib/python3.6/dist-packages/mxnet/_ctypes/ndarray.py in _imperative_invoke(handle, ndargs, keys, vals, out)
90 c_str_array(keys),
91 c_str_array([str(s) for s in vals]),
---> 92 ctypes.byref(out_stypes)))
93
94 if original_output is not None:
/usr/local/lib/python3.6/dist-packages/mxnet/base.py in check_call(ret)
251 """
252 if ret != 0:
--> 253 raise MXNetError(py_str(_LIB.MXGetLastError()))
254
255
MXNetError: [10:50:45] src/operator/tensor/./matrix_op-inl.h:657: Check failed: param_begin.ndim() <= dshape.ndim() (3 vs. 2) : Slicing axis exceeds data dimensions
Stack trace:
[bt] (0) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x4a357b) [0x7f5fce81657b]
[bt] (1) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x23fd387) [0x7f5fd0770387]
[bt] (2) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x2402f96) [0x7f5fd0775f96]
[bt] (3) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::imperative::SetShapeType(mxnet::Context const&, nnvm::NodeAttrs const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, mxnet::DispatchMode*)+0x1fb1) [0x7f5fd0a6acc1]
[bt] (4) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(mxnet::Imperative::Invoke(mxnet::Context const&, nnvm::NodeAttrs const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&)+0x1db) [0x7f5fd0a7493b]
[bt] (5) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(+0x25ffd99) [0x7f5fd0972d99]
[bt] (6) /usr/local/lib/python3.6/dist-packages/mxnet/libmxnet.so(MXImperativeInvokeEx+0x6f) [0x7f5fd097338f]
[bt] (7) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f601f94edae]
[bt] (8) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f601f94e71f]