我尝试把 1 张图片的 5 个标题(caption)连接成一个字符串,并在多进程中使用 spaCy 对单词做词形还原(lemmatization)。但是当我使用
nlp = en_core_web_sm.load()
line = nlp(line)
它卡住了...代码后面将提供更多信息。
import torchvision.datasets as dset
import torchvision.transforms as transforms
import pdb
import en_core_web_sm
import os
from multiprocessing import Pool, Manager
from itertools import islice
from smart_open import smart_open
from tqdm import tqdm
# Shared preprocessing pipeline: resize the short side to 224, take a
# 224x224 center crop, and convert the image to a tensor.
_coco_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

# MS-COCO 2014 captioning splits; each sample is (image, list of 5 captions).
cap_val = dset.CocoCaptions(
    root='../coco/images/val2014/',
    annFile='../coco/annotation/captions_val2014.json',
    transform=_coco_transform,
)
cap_train = dset.CocoCaptions(
    root='../coco/images/train2014/',
    annFile='../coco/annotation/captions_train2014.json',
    transform=_coco_transform,
)  # Load data from coco dataset
# spaCy English model, loaded once in the parent process and inherited by
# the Pool workers.
# NOTE(review): loading the model (which initializes torch/OpenMP thread
# state) BEFORE Pool() forks is a known cause of child processes hanging
# inside nlp(...) on fork-based platforms — consider loading the model
# inside each worker instead; confirm against the spaCy multiprocessing
# guidance.
nlp = en_core_web_sm.load()
def pre_lemmatize(ann_list, num_chunks=6):
    """Lemmatize ann_list in parallel by splitting it into contiguous chunks.

    The list is divided into ``num_chunks`` slices: the first chunks hold
    ``len(ann_list) // num_chunks`` lines each, the last ``remain`` chunks one
    extra line each, so every element is covered exactly once. Each slice is
    processed by ``lemmatize`` in a worker process, which mutates ann_list
    (expected to be a Manager().list() proxy) in place.

    :param ann_list: shared list of caption strings, modified in place
    :param num_chunks: number of slices / parallel tasks (default 6)
    :raises: re-raises any exception that occurred inside a worker
    """
    piece_size = len(ann_list) // num_chunks
    remain = len(ann_list) - piece_size * num_chunks
    p_lem = Pool()
    results = []
    # Chunks without the extra line.
    for i in range(num_chunks - remain):
        start_line = i * piece_size
        stop_line = (i + 1) * piece_size
        results.append(p_lem.apply_async(lemmatize, args=(ann_list, start_line, stop_line)))
    # Remaining chunks carry one extra line each.
    for i in range(remain):
        start_line = (num_chunks - remain) * piece_size + i * (piece_size + 1)
        stop_line = start_line + piece_size + 1
        results.append(p_lem.apply_async(lemmatize, args=(ann_list, start_line, stop_line)))
    p_lem.close()
    p_lem.join()
    # apply_async swallows worker exceptions unless .get() is called — without
    # this, a crashing worker is indistinguishable from a hang.
    for r in results:
        r.get()
def lemmatize(ann_list, line_start, line_stop):
    """Lemmatize lines [line_start, line_stop) of ann_list in place.

    Each caption string is replaced by a list of lemmas, keeping only tokens
    whose Penn Treebank tag starts with J (adjective), V (verb), N (noun),
    F (foreign word) or R (adverb).

    :param ann_list: shared list of caption strings, modified in place
    :param line_start: index of the first line to process (inclusive)
    :param line_stop: index one past the last line to process (exclusive)
    """
    kept_initials = ('J', 'V', 'N', 'F', 'R')
    total = line_stop - line_start
    for offset, line in enumerate(ann_list[line_start:line_stop]):
        doc = nlp(line)
        # Filter on token.tag_ (Penn Treebank: JJ, VB*, NN*, FW, RB*), whose
        # first letters match the initials above. The original checked
        # token.pos_, which holds UPOS tags (ADJ, VERB, NOUN, ADV, X, ...) —
        # those never start with J, F or R, so adjectives, adverbs and
        # foreign words were silently dropped.
        word_lem_list = [token.lemma_ for token in doc
                         if str(token.tag_)[0] in kept_initials]
        ann_list[line_start + offset] = word_lem_list
        # Per-line progress (the original message said "Chunk", but the
        # counter advances once per line, not per chunk).
        print("Line %d/%d completed!" % (offset + 1, total))
if __name__ == '__main__':
    # Guard the driver so that worker processes importing this module do not
    # re-run it — required by multiprocessing on spawn-based platforms
    # (Windows/macOS) and good hygiene everywhere.
    ann_list = Manager().list()
    for i in tqdm(range(20)):  # Load the first 20 validation samples
        # cap_val[i][1] is the list of 5 caption strings for one image.
        # Join with a space: ''.join() fused the last word of one caption to
        # the first word of the next, corrupting spaCy's tokenization.
        ann_list.append(' '.join(cap_val[i][1]))
    pre_lemmatize(ann_list)
cap_val[i][1] 是一个包含 5 个标题(caption)字符串的列表。
真正让我感到困惑的是:如果我在多进程所用的函数(lemmatize)之外对 ann_list 中的字符串做词形还原,它仍然可以正常工作;但是一旦我在函数 lemmatize 内部处理这些字符串,就会卡住!不会出现任何错误,程序只是一直挂着……
@ _ @不知道为什么会发生...
更新:好的,可能是因为我以错误的方式使用了 Manager().list()……一旦我把字符串拼接这一步放进 lemmatize 函数里,一切就正常了……有人能告诉我为什么吗?我猜是因为不应该在多进程所用的函数之外对 Manager().list() 的对象进行操作?