我有一个生成器,它返回文件名以及对应的 NLTK / spaCy NLP 任务。我想对每个文档并行执行该任务,即同时并行处理 k 个文档。
如何在 Python 中实现这一点?使用 multiprocessing 包或 asyncio 是否可行?或者用 pyAkka?
返回的结果将被添加到倒排索引中,如下例所示。
def gen_items():
    """Yield three sample ``(id, text)`` pairs, one per document.

    A line is printed to stdout immediately before each pair is yielded,
    which makes it visible when the consumer actually pulls the next item
    from this lazy generator.

    Yields:
        tuple: ``(id, text)`` with an integer id and a string text.
    """
    print("Yield 0")
    yield (0, 'Text 0')
    print("Yield 1")
    yield (1, 'text 1')
    print("Yield 2")
    yield (2, 'Text 2')
# Split the (id, text) stream into two independent iterators: the texts are
# fed to the NLP pipeline while the ids are kept aside for pairing afterwards.
gen1, gen2 = itertools.tee(gen_items())
ids = (id_ for (id_, text) in gen1)
texts = (text for (id_, text) in gen2)

# NOTE(review): `n_threads` is reported as deprecated in spaCy >= 2.1 and is
# not part of the v3 `Language.pipe` signature (`n_process` is the
# multi-process option there) -- confirm against the installed spaCy version.
docs = nlp.pipe(texts, batch_size=50, n_threads=4)

# Inverted index: token text -> set of ids of the documents containing it.
d = {}
# Pairing by position assumes `nlp.pipe` yields docs in input order
# -- TODO confirm for the configured pipeline.
for id_, doc in zip(ids, docs):
    print('id ' + str(id_))
    for token in doc:
        # Keep only alphabetic, non-stop-word tokens longer than one character.
        if token.is_alpha and not token.is_stop and len(token.orth_) > 1:
            strtok = token.orth_.strip()
            # Create the posting set on first sight, then record this document.
            d.setdefault(strtok, set()).add(id_)