Python Apply_async不等待其他进程完成

时间:2015-02-26 17:38:24

标签: python parallel-processing multiprocessing

我有以下示例代码,其中尝试使用 multiprocessing 模块。同样的写法之前在其他应用程序中可以正常工作,但在这里,有一个进程(由于数据切分的方式,它只分到极少量的数据)最先完成,随后整个程序就结束了。有人能帮我理解为什么程序没有等待其他进程完成吗?

def mpProcessor(basePath, jsonData, num_procs=mp.cpu_count()):
    """Process ``jsonData`` in slices using a pool of worker processes.

    ``jsonData`` is cut into consecutive slices of ``len(jsonData) // num_procs``
    records (at least one record per slice) and each slice is submitted to
    ``processJSON`` together with a 1-based slice counter and ``basePath``.

    The function blocks until every worker has finished. Any exception raised
    inside a worker is re-raised here instead of being silently discarded.

    Returns the manager-backed dict created for this run. Nothing visible in
    this function writes to it, so it is returned empty unless code elsewhere
    populates it.
    """
    manager = mp.Manager()
    shared_results = manager.dict()
    procs = mp.Pool(processes=num_procs, maxtasksperchild=1)

    # Floor division keeps the step an int on Python 3 as well; clamping to 1
    # avoids range() being given a zero step when there are fewer records
    # than workers (or no records at all).
    chunkSize = max(1, len(jsonData) // num_procs)

    pending = []
    for count, start in enumerate(range(0, len(jsonData), chunkSize), 1):
        print('test')
        chunk = jsonData[start:start + chunkSize]
        pending.append(
            procs.apply_async(processJSON, args=(count, basePath, chunk))
        )

    procs.close()
    procs.join()

    # apply_async stores a worker's exception in its AsyncResult; without
    # get() a crashed worker looks exactly like one that finished early.
    for result in pending:
        result.get()

    return shared_results

def processJSON(proc, basePath, records):
    """Build a ``Track`` for the first song found in each record.

    Each record is indexed at position 1 to obtain a JSON document whose
    ``'results'`` entry is a list of result objects. The first result whose
    ``'kind'`` is ``'song'`` supplies the ``'trackId'`` passed to
    ``Track.setTrackId``.

    ``proc`` and ``basePath`` are accepted for interface compatibility and are
    not used here.

    Raises:
        IndexError: if a record contains no result of kind ``'song'``.
    """
    print('Spawning new process: %d' % os.getpid())
    print(len(records))
    for record in records:
        # Parse once per record rather than once per inspected result.
        results = json.loads(record[1])['results']
        for candidate in results:
            # .get(): a result without a 'kind' key is simply not a song.
            if candidate.get('kind') == 'song':
                jsonObject = candidate
                break
        else:
            # Same exception type as running off the end of the list,
            # but with a message that says what went wrong.
            raise IndexError('no result of kind "song" found in record')
        tunesTrack = Track()
        tunesTrack.setTrackId(jsonObject['trackId'])
    print('Finished processing %d records with process %d' % (len(records), os.getpid()))

1 个答案:

答案 0 :(得分:1)

你似乎在重新发明轮子。

通过在进程池中使用初始化函数(initializer),并用 map 代替 apply_async,可以更轻松地实现您想做的事情。由于您的代码片段目前无法直接运行,我无法确定实际问题出在哪里。不过,下面的写法应该能简化您的代码,并使其更容易调试。

import json
import math
import multiprocessing as mp
import os

def pool_init(basePath_):
    """Set up the module-level state of a worker process.

    Stores ``basePath_`` in the global ``basePath``, resets the global
    ``job_count`` to 0 and prints the id of the current process.
    """
    global basePath, job_count
    basePath, job_count = basePath_, 0
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print('Spawning new process: %d' % os.getpid())

def mpProcessor(basePath, jsonData, num_procs=mp.cpu_count()):
    """Run ``processJSON`` over every record of ``jsonData`` in a worker pool.

    Each worker is initialised with ``pool_init(basePath)``. The call blocks
    until all records have been processed and returns the list produced by
    ``Pool.map`` (one entry per record, in input order). An exception raised
    by a worker propagates to the caller.
    """
    pool = mp.Pool(processes=num_procs, initializer=pool_init, initargs=(basePath,))
    try:
        # No explicit chunksize: Pool.map chooses one itself.
        results = pool.map(processJSON, jsonData)
    finally:
        # map() has either returned everything or failed, so no useful work
        # is left; stop the workers and reap them rather than leaking them.
        pool.terminate()
        pool.join()
    return results

# change processJSON to work with single records and
# remove proc and basePath args (as not needed)
def processJSON(record):
    """Build a ``Track`` for the first song found in a single record.

    ``record[1]`` is a JSON document whose ``'results'`` entry is a list of
    result objects. The first result whose ``'kind'`` is ``'song'`` supplies
    the ``'trackId'`` passed to ``Track.setTrackId``.

    Reads and increments the global ``job_count``, which must already be
    defined in this process (``pool_init`` sets it to 0).

    Raises:
        IndexError: if the record contains no result of kind ``'song'``.
    """
    global job_count
    print('Starting job %d in process: %d' % (job_count, os.getpid()))
    # Parse once rather than once per inspected result.
    results = json.loads(record[1])['results']
    for candidate in results:
        # .get(): a result without a 'kind' key is simply not a song.
        if candidate.get('kind') == 'song':
            jsonObject = candidate
            break
    else:
        # Same exception type as running off the end of the list,
        # but with a message that says what went wrong.
        raise IndexError('no result of kind "song" found in record')
    tunesTrack = Track()
    tunesTrack.setTrackId(jsonObject['trackId'])
    print('Finished processing job %d with process %d' % (job_count, os.getpid()))
    job_count += 1