我的代码如下所示。程序似乎"挂起"在 proc.join() 循环处。
如果我只用 10 条记录创建数据框 df,整个过程很快就能完成;
但改用 10000 条记录(如下面的代码所示)后,程序就一直挂起。
我用 htop 查看各个 CPU 核心的使用情况,确实看到所有核心都飙升到 100%,但在它们回落到 0% 很久之后,程序仍然没有继续执行。请问我哪里做错了?
import pandas as pd
import numpy as np
import multiprocessing
from multiprocessing import Process, Queue
def do_something(df, partition, q):
    """Put the values of each selected row of ``df`` on ``q``.

    For every entry in ``partition`` the row ``df.iloc[entry]`` is turned
    into a plain list of its values, and that list is put on the queue as a
    single item (one queue item per entry of ``partition``).
    """
    for row_pos in partition:
        row = df.iloc[row_pos]
        q.put(list(row))
def start_parallel_processing(df, partitions):
    """Process each partition of ``df`` in its own process and collect rows.

    One ``Process`` running ``do_something`` is started per partition; all
    of them write to a shared ``multiprocessing.Queue``.

    Args:
        df: The data frame handed to every worker.
        partitions: A sequence of index collections, one per worker.

    Returns:
        A list with one entry per index across all partitions, in the order
        the items arrived on the queue (not necessarily partition order).
    """
    q = Queue()
    procs = []
    for partition in partitions:
        proc = Process(target=do_something, args=(df, partition, q))
        proc.start()
        procs.append(proc)

    # Each worker puts one item per index of its partition, so the number of
    # items to read is the total number of indices, not the number of
    # partitions.
    expected = sum(len(partition) for partition in partitions)

    # Drain the queue completely BEFORE joining. A process that has put
    # items on a multiprocessing.Queue does not terminate until that data
    # has been flushed to the underlying pipe, so joining while items are
    # still unread can deadlock (this was the observed hang).
    # NOTE(review): if a worker dies before putting all of its items, this
    # loop blocks forever; add a timeout if that needs to be handled.
    results = [q.get(True) for _ in range(expected)]

    for proc in procs:
        proc.join()
    return results
# The driver is guarded so that worker processes, which may re-import this
# module under spawn-based start methods, do not re-run it and launch
# processes of their own.
if __name__ == "__main__":
    num_cpus = multiprocessing.cpu_count()
    df = pd.DataFrame([(x, x + 1) for x in range(10000)], columns=['x', 'y'])
    # One partition of the index per CPU core.
    partitions = np.array_split(df.index, num_cpus)
    results = start_parallel_processing(df, partitions)
    # A bare `len(results)` only shows a value in an interactive session;
    # print it so that running the file as a script reports the count.
    print(len(results))
答案 0 :(得分:1)
Queue.Queue 的行为似乎不符合您的预期:它并不是为在多个进程之间共享而设计的,必须改用 Manager.Queue()。
我添加了一些打印语句,以便理解您代码的执行流程。
此外,您还可以使用 Pool() 来代替按 num_cpus 手动创建进程的做法。
import pandas as pd
import numpy as np
import multiprocessing
import pprint
from multiprocessing import Process, Queue, Manager
def do_something(df, partition, q):
    """Put every value of each selected row of ``df`` on ``q``.

    For each entry in ``partition`` the row ``df.iloc[entry]`` is iterated
    and each of its values is put on the queue as a separate item. When all
    entries have been processed, the current queue size is printed.

    Args:
        df: The data frame to read rows from.
        partition: Iterable of row positions to process.
        q: Queue-like object providing ``put(item, block)`` and ``qsize()``.
    """
    for index in partition:
        for v in df.iloc[index]:
            # Second positional argument is `block`; False means the put
            # does not wait for free space in the queue.
            q.put(v, False)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("task_done(), qsize is " + str(q.qsize()))
def start_parallel_processing(df, partitions):
    """Process each partition of ``df`` in its own process and collect values.

    One ``Process`` running ``do_something`` is started per partition; all
    of them write to a queue created by a ``Manager``. Progress messages are
    printed along the way.

    Args:
        df: The data frame handed to every worker.
        partitions: A sequence of index collections, one per worker.

    Returns:
        A list of every item the workers put on the queue, in queue order.
    """
    m = Manager()
    try:
        q = m.Queue()
        procs = []
        print("START: launching " + str(len(partitions)) + " process(es)")
        for index, partition in enumerate(partitions, 1):
            print("launching " + str(len(partitions)) + " process")
            proc = Process(target=do_something, args=(df, partition, q))
            procs.append(proc)
            proc.start()
            print("launched " + str(index) + "/" + str(len(partitions)) + " process(es)")

        # Wait for every worker before reading. A manager-backed queue does
        # not have the join-before-drain deadlock of multiprocessing.Queue,
        # and once all producers have exited the queue contents are final.
        for process_count, proc in enumerate(procs, 1):
            print("joining " + str(process_count) + "/" + str(len(procs)) + " process(es)")
            proc.join()

        # Polling with a non-blocking get while workers are still running
        # can hit a momentarily empty queue and stop early, losing results.
        # After the joins this process is the only queue user, so empty()
        # is a reliable stop condition.
        results = []
        while not q.empty():
            results.append(q.get())
        print("QUEUE END")
        print(pprint.pformat(results))
        return results
    finally:
        # Stop the manager's server process; `results` holds plain copies of
        # the items, so it stays valid after shutdown.
        m.shutdown()
# The driver is guarded so that worker processes, which may re-import this
# module under spawn-based start methods, do not re-run it and launch
# processes of their own.
if __name__ == "__main__":
    num_cpus = multiprocessing.cpu_count()
    df = pd.DataFrame([(x, x + 1) for x in range(10000)], columns=['x', 'y'])
    # One partition of the index per CPU core.
    partitions = np.array_split(df.index, num_cpus)
    results = start_parallel_processing(df, partitions)
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print("len(results) is: " + str(len(results)))