Question

我正在使用DataLoader从基于numpy memmap的自定义Dataset对象读取数据。只要我不打乱顺序地读取数据，一切都能正常工作；但是，一旦我设置shuffle=True，程序就会在运行时崩溃。

我尝试在Dataset类中使用置换向量来实现打乱（shuffle）机制，并在DataLoader中设置shuffle=False，但问题仍然存在。我还注意到，在打乱时，Dataset对象的__getitem__()函数被调用了n次，其中n是batch_size。

这是数据集代码：

class CustomDataset(Dataset):
  """Read-only dataset backed by four numpy memmap files located under ``root``.

  The files ``input``, ``input-lengths``, ``output`` and ``output-lengths``
  are mapped in read-only mode; nothing is loaded eagerly.  Each item is
  ``((input_length, input_tensor), (output_length, output_tensor))``.

  NOTE(review): ``torch.from_numpy`` does not copy, so the returned tensors
  are views over the read-only mappings -- confirm that downstream code never
  writes to them in place.
  """
  # Number of records stored in every memmap file (first dimension).
  num_pattern = 60112
  base_folder = 'dataset'

  def __init__(self, root):
    """Map the four data files found in ``root``.

    Args:
      root: Directory containing the memmap files; a leading ``~`` is
        expanded to the user's home directory.
    """
    # Build every path from the *expanded* root; previously the raw ``root``
    # was used, so a path such as ``~/data`` was never expanded.
    self.root = os.path.expanduser(root)
    n = self.num_pattern

    self.output_ = np.memmap('{0}/output'.format(self.root), 'int64', 'r', shape=(n, 62))
    self.out_len = np.memmap('{0}/output-lengths'.format(self.root), 'int32', 'r', shape=(n,))
    self.input_ = np.memmap('{0}/input'.format(self.root), 'float32', 'r', shape=(n, 512, 1024))
    self.in_len = np.memmap('{0}/input-lengths'.format(self.root), 'int32', 'r', shape=(n,))

  def __len__(self):
    """Return the number of records in the dataset."""
    return self.num_pattern

  def __getitem__(self, index):
    """Return ``((in_len, input), (out_len, output))`` for record ``index``.

    The lengths are numpy scalars read from the length files; ``input`` and
    ``output`` are tensors created with ``torch.from_numpy``.
    """
    source = (self.in_len[index], torch.from_numpy(self.input_[index]))
    target = (self.out_len[index], torch.from_numpy(self.output_[index]))
    return source, target

if __name__ == '__main__':
  # Build the memmap-backed dataset and iterate over it in batches of 32
  # using one worker subprocess.
  dataset = CustomDataset(root='/content/')
  data_loader = data.DataLoader(dataset, batch_size=32, shuffle=False, num_workers=1)
  # The loop variable is named `batch` so it does not rebind the `data`
  # module that was just used for `data.DataLoader`.
  for i, batch in enumerate(data_loader, 0):
    # training
    pass  # placeholder: the training step was omitted from the snippet

错误堆栈如下：

RuntimeError                              Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _try_get_batch(self, timeout)
    510         try:
--> 511             data = self.data_queue.get(timeout=timeout)
    512             return (True, data)

9 frames
/usr/lib/python3.6/multiprocessing/queues.py in get(self, block, timeout)
    103                     timeout = deadline - time.monotonic()
--> 104                     if not self._poll(timeout):
    105                         raise Empty

/usr/lib/python3.6/multiprocessing/connection.py in poll(self, timeout)
    256         self._check_readable()
--> 257         return self._poll(timeout)
    258 

/usr/lib/python3.6/multiprocessing/connection.py in _poll(self, timeout)
    413     def _poll(self, timeout):
--> 414         r = wait([self], timeout)
    415         return bool(r)

/usr/lib/python3.6/multiprocessing/connection.py in wait(object_list, timeout)
    910             while True:
--> 911                 ready = selector.select(timeout)
    912                 if ready:

/usr/lib/python3.6/selectors.py in select(self, timeout)
    375             try:
--> 376                 fd_event_list = self._poll.poll(timeout)
    377             except InterruptedError:

/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/signal_handling.py in handler(signum, frame)
     62         # Python can still get and update the process status successfully.
---> 63         _error_if_any_worker_fails()
     64         if previous_handler is not None:

RuntimeError: DataLoader worker (pid 3978) is killed by signal: Bus error. 

During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)
<ipython-input-8-b407a8532808> in <module>()
      5   data_loader = data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1)
      6 
----> 7   for i, data in enumerate(data_loader, 0):
      8     print(i)

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    574         while True:
    575             assert (not self.shutdown and self.batches_outstanding > 0)
--> 576             idx, batch = self._get_batch()
    577             self.batches_outstanding -= 1
    578             if idx != self.rcvd_idx:

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _get_batch(self)
    551         else:
    552             while True:
--> 553                 success, data = self._try_get_batch()
    554                 if success:
    555                     return data

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _try_get_batch(self, timeout)
    517             if not all(w.is_alive() for w in self.workers):
    518                 pids_str = ', '.join(str(w.pid) for w in self.workers if not w.is_alive())
--> 519                 raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str))
    520             if isinstance(e, queue.Empty):
    521                 return (False, None)

RuntimeError: DataLoader worker (pid(s) 3978) exited unexpectedly

Answer 1

这是一个共享内存错误，您的数据加载器可能需要更多内存才能完成此特定任务。

Answer 2

RuntimeError: DataLoader worker (pid(s) 3978) exited unexpectedly

出现这个错误的原因在于 data.DataLoader(dataset, batch_size=32, shuffle=False, num_workers=1) 中的 num_workers 设置：请改为 num_workers=0。该错误表示你的子进程（subprocesses）中没有可用的 CPU。

打乱（shuffle）时DataLoader崩溃

2 个答案: