我使用多处理来划分任务分成子过程。我通过定制多处理管理器来注册defaultdict(list),创建了一个共享字典。子过程更新此字典使用追加值,但字典只包含的键,对应的值丢失。
from collections import defaultdict
from multiprocessing.managers import BaseManager, DictProxy
import multiprocessing as mp, io
class MyManager(BaseManager):
pass
def process_read(chunkStart, chunkSize, doc_cluster, f_path):
lock = mp.Lock()
with io.open(f_path) as f_handle:
f.seek(chunkStart)
lines = f.read(chunkSize).splitlines()
for line in lines:
f_name = line.rstrip()
key = some_processing()
lock.acquire()
try:
doc_cluster[key].append(f_name)
finally:
lock.release()
if __name__ == '__main__':
results = []
cores = mp.cpu_count()
pool = mp.Pool(cores)
fp = 'B:/FN/FN_test.txt'
MyManager.register('defaultdict', defaultdict, DictProxy)
mgr = MyManager()
mgr.start()
d_cluster = mgr.defaultdict(list)
for chunk_Start, chunk_Size in chunkify(fp):
results.append(pool.apply_async(process_read, args=(chunk_Start, chunk_Size, d_cluster, fp,)))
pool.close()
pool.join()
print d_cluster
实际输出:
{ 'CD':[], 'AB':[], 'BC':[]}
预期输出:
{ 'CD':[f_name1,f_name2,f_name3], 'AB':[f_name5,f_name7], 'BC':[f_name4,f_name6]}