我是多处理的新手,我已经编写了下面的脚本,但这些方法没有被调用。我不明白我错过了什么。
我想做的是以下内容:
在另一个之前调用一个方法。
# import all necessary modules
import Queue
import logging
import multiprocessing
import time, sys
import signal
debug = True
def init_worker():
signal.signal(signal.SIGINT, signal.SIG_IGN)
research_name_id = {}
ids = [55, 125, 428, 429, 430, 895, 572, 126, 833, 502, 404]
# declare all the static variables
num_threads = 2 # number of parallel threads
minDelay = 3 # minimum delay
maxDelay = 7 # maximum delay
# declare an empty queue which will hold the publication ids
queue = Queue.Queue(0)
proxies = []
#print (proxies)
def split(a, n):
"""Function to split data evenly among threads"""
k, m = len(a) / n, len(a) % n
return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]
for i in xrange(n))
def run_worker(
i,
data,
queue,
research_name_id,
proxies,
debug,
minDelay,
maxDelay):
""" Function to pull out all publication links from nist
data - research ids pulled using a different script
queue - add the publication urls to the list
research_name_id - dictionary with research id as key and name as value
proxies - scraped proxies
"""
print 'getLinks', i
for d in data:
print d
queue.put(d)
def fun_worker(i, queue, proxies, debug, minDelay, maxDelay):
print 'publicationData', i
try:
print queue.pop()
except:
pass
def main():
print "Initializing workers"
pool = multiprocessing.Pool(num_threads, init_worker)
distributed_ids = list(split(list(ids), num_threads))
for i in range(num_threads):
data_thread = distributed_ids[i]
print data_thread
pool.apply_async(run_worker, args=(i + 1,
data_thread,
queue,
research_name_id,
proxies,
debug,
minDelay,
maxDelay,
))
pool.apply_async(fun_worker,
args=(
i + 1,
queue,
proxies,
debug,
minDelay,
maxDelay,
))
try:
print "Waiting 10 seconds"
time.sleep(10)
except KeyboardInterrupt:
print "Caught KeyboardInterrupt, terminating workers"
pool.terminate()
pool.join()
else:
print "Quitting normally"
pool.close()
pool.join()
if __name__ == "__main__":
main()
我得到的唯一输出是
Initializing workers
[55, 125, 428, 429, 430, 895]
[572, 126, 833, 502, 404]
Waiting 10 seconds
Quitting normally
答案 0 :(得分:0)
有几个问题:
multiprocessing.Queue
apply_async
等与子流程共享队列,则需要使用经理(see example)。但是,你应该退后一步,问问自己你想做什么。 apply_async
真的要走了吗?您有一个要反复映射的项目列表,应用一些计算密集的长时间运行的转换(因为如果它们只是阻塞I / O,您也可以使用线程)。在我看来,imap_unordered
实际上是你想要的:
pool = multiprocessing.Pool(num_threads, init_worker)
links = pool.imap_unordered(run_worker1, ids)
output = pool.imap_unordered(fun_worker1, links)
需要修改 run_worker1
和fun_worker1
以获取单个参数。如果您需要共享其他数据,那么您应该在初始化程序中传递它,而不是一遍又一遍地将它传递给子进程。