I am using multiprocessing in Python to parallelize some computation-heavy functions. I noticed, however, that process creation is delayed when a fat argument is passed (e.g., a networkx graph with 1000 nodes, or a list of 1,000,000 items). I experimented with two multiprocessing modules, 'multiprocessing' and 'pathos', and got similar results. My question is how to avoid this delay, because it destroys the benefit of parallel computation.
In my sample code, I just pass the fat argument to the function being multiprocessed; the function body never touches it at all.
"2018-12-15T14:12:12"
Sample code using 'multiprocessing':
import multiprocessing
import time

def f(args):
    # The fat graph argument is unpacked but never used in the body.
    (x, conn, t0, graph) = args
    ans = 1
    x0 = x
    t = time.time() - t0
    conn.send('factorial of %d: start@%.2fs' % (x0, t))
    while x > 1:
        ans *= x
        time.sleep(0.5)
        x -= 1
    t = time.time() - t0
    conn.send('factorial of %d: finish@%.2fs, res = %d' % (x0, t, ans))
    return ans

def main():
    var = (4, 8, 12, 20, 16)
    p = multiprocessing.Pool(processes=4)
    p_conn, c_conn = multiprocessing.Pipe()
    params = []
    t0 = time.time()

    # Build the fat argument: a complete digraph on 1000 nodes with random edge weights.
    N = 1000
    import networkx as nx
    G = nx.complete_graph(N, nx.DiGraph())
    import random
    for (start, end) in G.edges:
        G.edges[start, end]['weight'] = random.random()

    for i in var:
        params.append((i, c_conn, t0, G))
    res = list(p.imap(f, params))
    p.close()
    p.join()
    print('output:')
    while p_conn.poll():
        print(p_conn.recv())
    t = time.time() - t0
    print('factorial of %s@%.2fs: %s' % (var, t, res))

if __name__ == '__main__':
    main()
The output of the above sample code is shown below; there is a delay of approximately 24 seconds between the creation of two consecutive processes:
output:
factorial of 4: start@29.78s
factorial of 4: finish@31.29s, res = 24
factorial of 8: start@53.56s
factorial of 8: finish@57.07s, res = 40320
factorial of 12: start@77.25s
factorial of 12: finish@82.75s, res = 479001600
factorial of 20: start@100.39s
factorial of 20: finish@109.91s, res = 2432902008176640000
factorial of 16: start@123.55s
factorial of 16: finish@131.05s, res = 20922789888000
factorial of (4, 8, 12, 20, 16)@131.06s: [24, 40320, 479001600, 2432902008176640000, 20922789888000]
Process finished with exit code 0
The equivalent sample code using 'pathos':
import pathos
import multiprocess
import time

def f(x, conn, t0, graph):
    # pathos maps over multiple argument sequences, so no tuple unpacking is needed.
    ans = 1
    x0 = x
    t = time.time() - t0
    conn.send('factorial of %d: start@%.2fs' % (x0, t))
    while x > 1:
        ans *= x
        time.sleep(0.5)
        x -= 1
    t = time.time() - t0
    conn.send('factorial of %d: finish@%.2fs, res = %d' % (x0, t, ans))
    return ans

def main():
    var = (4, 8, 12, 20, 16)
    p = pathos.multiprocessing.ProcessPool(nodes=4)
    p_conn, c_conn = multiprocess.Pipe()
    t0 = time.time()
    conn_s = [c_conn] * len(var)
    t0_s = [t0] * len(var)

    N = 1000
    import networkx as nx
    G = nx.complete_graph(N, nx.DiGraph())
    import random
    for (start, end) in G.edges:
        G.edges[start, end]['weight'] = random.random()

    res = list(p.imap(f, var, conn_s, t0_s, [G] * len(var)))
    print('output:')
    while p_conn.poll():
        print(p_conn.recv())
    t = time.time() - t0
    print('factorial of %s@%.2fs: %s' % (var, t, res))

if __name__ == '__main__':
    main()
The output of the above sample code:
output:
factorial of 4: start@29.63s
factorial of 4: finish@31.13s, res = 24
factorial of 8: start@53.50s
factorial of 8: finish@57.00s, res = 40320
factorial of 12: start@76.94s
factorial of 12: finish@82.44s, res = 479001600
factorial of 20: start@100.72s
factorial of 20: finish@110.23s, res = 2432902008176640000
factorial of 16: start@123.69s
factorial of 16: finish@131.20s, res = 20922789888000
factorial of (4, 8, 12, 20, 16)@131.20s: [24, 40320, 479001600, 2432902008176640000, 20922789888000]
Process finished with exit code 0
Similarly, this output shows a delay of approximately 24 seconds between the creation of two consecutive processes.
If I reduce the size of the graph (fewer nodes), the delay shrinks accordingly. I guess it is caused by the extra time needed to pickle/dill the networkx graph as an argument. Ideally, the first four processes should be created at the same time. How can I avoid this cost? Thanks!
Update
Thanks to Alexander's answer, I removed the pipe from both the 'multiprocessing' and the 'pathos' code. The 'multiprocessing' code now performs like Alexander's, with the delay reduced to 1 second, but the 'pathos' code still shows delays of more than 20 seconds. The modified 'pathos' code is posted below:
import pathos
import multiprocess
import time
from pympler import asizeof
import sys

def f(args):
    # No pipe any more: just unpack, print the start time, and simulate work.
    (x, graph) = args
    t = time.ctime()
    print('factorial of %d: start@%s' % (x, t))
    time.sleep(4)
    return x

def main():
    t0 = time.time()
    params = []
    var = (4, 8, 12, 20, 16)
    p = pathos.multiprocessing.ProcessPool(nodes=4)

    N = 1000
    import networkx as nx
    G = nx.complete_graph(N, nx.DiGraph())
    import random
    for (start, end) in G.edges:
        G.edges[start, end]['weight'] = random.random()
    print('Size of G by sys', sys.getsizeof(G), 'asizeof', asizeof.asizeof(G))
    print('G created in %.2f' % (time.time() - t0))

    for i in var:
        params.append((i, G))
    res = list(p.imap(f, params))
    p.close()
    p.join()

if __name__ == '__main__':
    main()
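As a sanity check that serialization really is where the time goes, the graph can be pickled by hand and timed. A minimal sketch (plain pickle is used here for illustration; pathos/multiprocess actually serialize with dill):

import pickle
import random
import time
import networkx as nx

# Build the same fat argument as in the question.
G = nx.complete_graph(1000, nx.DiGraph())
for (u, v) in G.edges:
    G.edges[u, v]['weight'] = random.random()

# Time one serialization round trip, which is roughly what each task
# submission pays when the graph is passed as an argument to the pool.
t0 = time.time()
blob = pickle.dumps(G)
print('pickled %d bytes in %.2fs' % (len(blob), time.time() - t0))
t0 = time.time()
pickle.loads(blob)
print('unpickled in %.2fs' % (time.time() - t0))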
Answer 0 (score: 1)
When each process is created, this fat argument (338 MB) has to be copied into separate memory, but it should not take that long (24 seconds). Here is how it works on my machine; I changed the code as follows:
import multiprocessing
import os
import time
import sys
from pympler import asizeof
import networkx as nx
import random

def factorial(args):
    (x, t, graph) = args
    s0 = '# pid %s x %2d' % (format(os.getpid()), x)
    s1 = 'started @ %.2f' % (time.time() - t)
    print(s0, s1)
    f = 1
    while x > 1:
        f *= x
        x -= 1
        time.sleep(0.5)
    s2 = 'ended @ %.2f' % (time.time() - t)
    print(s0, s2, f)
    return s0, s1, s2, f

if __name__ == '__main__':
    t0 = time.time()
    N = 1000
    G = nx.complete_graph(N, nx.DiGraph())
    for (start, end) in G.edges:
        G.edges[start, end]['weight'] = random.random()
    print('Size of G by sys', sys.getsizeof(G), 'asizeof', asizeof.asizeof(G))
    print('G created in %.2f' % (time.time() - t0))
    t0 = time.time()
    p = multiprocessing.Pool(processes=4)
    outputs = list(p.imap(factorial, [(i, t0, G) for i in (4, 8, 12, 20, 16)]))
    print('output:')
    for output in outputs:
        print(output)
Now the output:
Size of G by sys 56 asizeof 338079824
G created in 13.03
# pid 2266 x 4 started @ 1.27
# pid 2267 x 8 started @ 1.98
# pid 2268 x 12 started @ 2.72
# pid 2266 x 4 ended @ 2.77 24
# pid 2269 x 20 started @ 3.44
# pid 2266 x 16 started @ 4.09
# pid 2267 x 8 ended @ 5.49 40320
# pid 2268 x 12 ended @ 8.23 479001600
# pid 2266 x 16 ended @ 11.60 20922789888000
# pid 2269 x 20 ended @ 12.95 2432902008176640000
output:
('# pid 2266 x 4', 'started @ 1.27', 'ended @ 2.77', 24)
('# pid 2267 x 8', 'started @ 1.98', 'ended @ 5.49', 40320)
('# pid 2268 x 12', 'started @ 2.72', 'ended @ 8.23', 479001600)
('# pid 2269 x 20', 'started @ 3.44', 'ended @ 12.95', 2432902008176640000)
('# pid 2266 x 16', 'started @ 4.09', 'ended @ 11.60', 20922789888000)
338 MB of data created in 11 seconds, and yes, it does take time to start the first 4 processes. The delays between starts are much smaller though: 0.71, 0.74 and 0.72 seconds. I have an iMac with an Intel i5 @ 3.2 GHz.
The maximum N with no visible delay is 78:
Size of G by sys 56 asizeof 1970464
G created in 0.08
# pid 2242 x 4 started @ 0.01
# pid 2243 x 8 started @ 0.01
# pid 2244 x 12 started @ 0.01
# pid 2245 x 20 started @ 0.01
# pid 2242 x 4 ended @ 1.51 24
# pid 2242 x 16 started @ 1.53
# pid 2243 x 8 ended @ 3.52 40320
# pid 2244 x 12 ended @ 5.52 479001600
# pid 2242 x 16 ended @ 9.04 20922789888000
# pid 2245 x 20 ended @ 9.53 2432902008176640000
output:
('# pid 2242 x 4', 'started @ 0.01', 'ended @ 1.51', 24)
('# pid 2243 x 8', 'started @ 0.01', 'ended @ 3.52', 40320)
('# pid 2244 x 12', 'started @ 0.01', 'ended @ 5.52', 479001600)
('# pid 2245 x 20', 'started @ 0.01', 'ended @ 9.53', 2432902008176640000)
('# pid 2242 x 16', 'started @ 1.53', 'ended @ 9.04', 20922789888000)
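If every task uses the same graph, the copying cost can be avoided by not passing G as a task argument at all: on platforms that fork (Linux, macOS), workers can inherit it through a module-level global set by a pool initializer. A minimal sketch of this approach, not part of the code above (it assumes the fork start method; with spawn, G would still be serialized, but once per worker instead of once per task):

import multiprocessing
import random
import time
import networkx as nx

_G = None  # set once in each worker by the initializer

def init_worker(graph):
    # Runs once per worker process; under fork the graph is inherited
    # from the parent instead of being pickled for every task.
    global _G
    _G = graph

def factorial(x):
    f = 1
    x0 = x
    while x > 1:
        f *= x
        x -= 1
        time.sleep(0.5)
    # _G is available here without ever being passed as an argument.
    return x0, f, _G.number_of_nodes()

if __name__ == '__main__':
    G = nx.complete_graph(1000, nx.DiGraph())
    for (u, v) in G.edges:
        G.edges[u, v]['weight'] = random.random()
    p = multiprocessing.Pool(processes=4, initializer=init_worker, initargs=(G,))
    print(list(p.imap(factorial, (4, 8, 12, 20, 16))))
    p.close()
    p.join()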
Answer 1 (score: 0)
I changed N to 50 and ran the 'pathos' code with the debugger in PyCharm, stopping it after 'G created in 7.79'. The output below confirms my suspicion about why 'pathos' is slow: pathos uses connection and socket objects (depending on the platform) to pass arguments and start the child processes, and that is why it is so much slower, roughly 30 times. On the bright side: it can work across a network.
Debugger output:
/usr/local/bin/python3.7 "/Applications/PyCharm CE.app/Contents/helpers/pydev/pydevd.py" --multiproc --qt-support=auto --client 127.0.0.1 --port 51876 --file /Users/alex/PycharmProjects/game/object_type.py
pydev debugger: process 1526 is connecting
Connected to pydev debugger (build 191.6605.12)
Size of G by sys 56 asizeof 57126904
G created in 7.79
Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Traceback (most recent call last):
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/pool.py", line 110, in worker
task = get()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/queues.py", line 354, in get
with self._rlock:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/pool.py", line 110, in worker
task = get()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/synchronize.py", line 102, in __enter__
return self._semlock.__enter__()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/queues.py", line 354, in get
with self._rlock:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/synchronize.py", line 102, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
KeyboardInterrupt
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/pool.py", line 110, in worker
task = get()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/queues.py", line 355, in get
res = self._reader.recv_bytes()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/connection.py", line 219, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/connection.py", line 410, in _recv_bytes
buf = self._recv(4)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/connection.py", line 382, in _recv
chunk = read(handle, remaining)
Traceback (most recent call last):
KeyboardInterrupt
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 297, in _bootstrap
self.run()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/pool.py", line 110, in worker
task = get()
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/queues.py", line 354, in get
with self._rlock:
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/synchronize.py", line 102, in __enter__
return self._semlock.__enter__()
KeyboardInterrupt
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/pool.py", line 733, in next
item = self._items.popleft()
IndexError: pop from an empty deque
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Applications/PyCharm CE.app/Contents/helpers/pydev/pydevd.py", line 1741, in <module>
main()
File "/Applications/PyCharm CE.app/Contents/helpers/pydev/pydevd.py", line 1735, in main
globals = debugger.run(setup['file'], None, None, is_module)
File "/Applications/PyCharm CE.app/Contents/helpers/pydev/pydevd.py", line 1135, in run
pydev_imports.execfile(file, globals, locals) # execute the script
File "/Applications/PyCharm CE.app/Contents/helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "/Users/alex/PycharmProjects/game/object_type.py", line 100, in <module>
outputs = list(p.imap(factorial, [(i, t0, G) for i in (4, 8, 12, 20, 16)]))
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/site-packages/multiprocess/pool.py", line 737, in next
self._cond.wait(timeout)
File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/threading.py", line 296, in wait
waiter.acquire()
KeyboardInterrupt
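The traceback above shows the workers blocked reading tasks from the queue. To gauge how much of the slowdown comes from serialization itself rather than from the transport, the two serializers can be timed side by side; a minimal sketch (multiprocess/pathos serialize with dill, the stdlib pool with pickle):

import pickle
import random
import time
import dill
import networkx as nx

G = nx.complete_graph(1000, nx.DiGraph())
for (u, v) in G.edges:
    G.edges[u, v]['weight'] = random.random()

# Compare how long each serializer takes on the same fat graph.
for name, dumps in (('pickle', pickle.dumps), ('dill', dill.dumps)):
    t0 = time.time()
    n = len(dumps(G))
    print('%s: %d bytes in %.2fs' % (name, n, time.time() - t0))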
Answer 2 (score: 0)
On a related note: I ran into this problem when trying to pass a pandas DataFrame as an argument to a function whose parallel execution was managed by joblib.
Joblib pickles the arguments to pass the information to each worker. Pickling even moderately sized (<1 MB) DataFrames can be time-consuming. In my case the pickling was so bad that joblib with 10-20 workers was slower than a simple loop. However, joblib handles lists, dicts and np.arrays much more efficiently. So a simple trick I found is to pass a list containing the DataFrame's contents as an np.array together with its columns, and to reassemble the DataFrame inside the function; see the sketch below.
Passing param = [df.values, df.columns] to joblib is ~50x faster than simply passing param = df.
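A minimal sketch of the trick (using joblib's Parallel/delayed API; the process function, column names, and sizes here are illustrative):

import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def process(param):
    # Reassemble the DataFrame from the cheap-to-pickle pieces.
    values, columns = param
    df = pd.DataFrame(values, columns=columns)
    return df['a'].sum()

if __name__ == '__main__':
    df = pd.DataFrame(np.random.rand(100000, 3), columns=['a', 'b', 'c'])
    param = [df.values, df.columns]  # pickles much faster than df itself
    results = Parallel(n_jobs=4)(delayed(process)(param) for _ in range(8))
    print(results)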