因此,我尝试在pytorch中使用DataLoader
来训练神经网络。在每个批次中,我想获取图像,蒙版和文件名。为了整理文件名,我编写了一个自定义整理功能。
由于某些原因,每当我在 DataLoader 中设置 num_workers > 0 时,代码都会崩溃,但在 num_workers = 0 时可以正常工作。我收到类似以下的错误消息,它不是很有帮助:
RuntimeError: DataLoader worker (pid 10449) is killed by signal: Killed. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
有人知道这是怎么回事,如何在pytorch中调试多进程?
class MyDataset(Dataset):
    """Dataset of (image, mask, folder-path) triples.

    Each entry in ``self.folderlist`` is a directory expected to contain
    exactly one ``*_img.jpg`` image and one ``*_mask.png`` mask.
    """

    def __init__(self, df=None):
        self.folderlist = []
        # Load path of files here... df is just a dataframe of which specifies which files to load.

    def __len__(self):
        # One sample per folder.
        return len(self.folderlist)

    def __getitem__(self, idx):
        folder = self.folderlist[idx]
        entries = os.listdir(folder)
        # Pick the first file matching each suffix (IndexError if absent,
        # same as the original list-comprehension-then-index form).
        img_name = [f for f in entries if f.endswith(('_img.jpg'))][0]
        msk_name = [f for f in entries if f.endswith(('_mask.png'))][0]
        image = np.array(Image.open(os.path.join(folder, img_name)))
        mask = Image.open(os.path.join(folder, msk_name))
        # Some transforms here
        image = transforms.ToTensor()(image)
        mask = transforms.ToTensor()(mask)
        # The folder path rides along so callers can recover the filename.
        return image, mask, folder
def collate_fn(batch, default=False, depth=0):
    """Collate (image, mask, filename) samples into a batch.

    Tensors are stacked along a new leading batch dimension, ints become a
    LongTensor, floats a DoubleTensor, and strings are passed through
    untouched so filenames survive batching.

    Args:
        batch: list of samples; on recursion, a tuple of one field across
            the batch (all elements share a type).
        default: if True, defer entirely to torch's default_collate.
        depth: recursion depth. Fields are only transposed at depth 0 so
            nested lists are not zipped a second time.
    """
    if default:
        return default_collate(batch)
    elem = batch[0]
    if isinstance(elem, torch.Tensor):
        out = None
        # Take the shared-memory fast path only inside a DataLoader worker
        # process, mirroring torch's default_collate. The original code had
        # `if True:` here, which forced the shared allocation even in the
        # main process where it buys nothing.
        if torch.utils.data.get_worker_info() is not None:
            # Concatenate directly into a shared-memory tensor to avoid an
            # extra copy when handing the batch back to the main process.
            numel = sum(x.numel() for x in batch)
            storage = elem.storage()._new_shared(numel)
            out = elem.new(storage)
        return torch.stack(batch, 0, out=out)
    if isinstance(elem, int):
        # was `int_classes` — removed along with torch._six in modern torch
        return torch.LongTensor(batch)
    if isinstance(elem, float):
        return torch.DoubleTensor(batch)
    if isinstance(elem, str):
        # was `string_classes` — keep strings (filenames) as-is
        return batch
    if depth == 0:
        # Transpose [(img, mask, name), ...] -> (imgs, masks, names) and
        # collate each field independently.
        transposed = zip(*batch)
        return [collate_fn(samples, depth=depth + 1) for samples in transposed]
    return batch  # don't zip up second+ order lists
# Some class function
def train():
    """Build train/val loaders over MyDataset and iterate each once,
    printing the batch filenames and a running file count.

    NOTE(review): this function references ``self``, ``df_train`` and
    ``df_val`` but declares no parameters — presumably it is a method whose
    signature/context was lost in the paste; confirm against the full file.
    """
    # ...
    train_dataset = MyDataset(df=df_train)
    train_loader = DataLoader(train_dataset, batch_size=self.config.TRAIN_BS, shuffle=True,
                              num_workers=self.config.NUM_WORKERS, collate_fn=collate_fn)
    val_dataset = MyDataset(df=df_val)
    # NOTE(review): shuffle=True on a validation loader is unusual — confirm intended.
    val_loader = DataLoader(val_dataset, batch_size=self.config.VAL_BS, shuffle=True,
                            num_workers=self.config.NUM_WORKERS, collate_fn=collate_fn)
    nfiles = 0
    for i, (images, masks, filenames) in enumerate(train_loader):
        nfiles += len(filenames)
        print('train {}; nfiles: {}'.format(filenames, nfiles))
    for i, (images, masks, filenames) in enumerate(val_loader):
        # fix: the val loop printed nfiles without ever accumulating it,
        # so the reported total was stale (train-only).
        nfiles += len(filenames)
        print('val {}; nfiles: {}'.format(filenames, nfiles))
在通过train_loader完成迭代之后,程序崩溃。