首先看下面的代码:
# Collect items from the generator into fixed-size batches and hand each
# full batch to a worker process.
pool = multiprocessing.Pool(processes=N)
batch = []
for item in generator():
    batch.append(item)
    if len(batch) == 10:
        pool.apply_async(my_fun, args=(batch,))
        batch = []  # start a fresh batch; the worker keeps its own reference
# leftovers: only submit the final partial batch if it is non-empty
# (the original submitted unconditionally, possibly an empty list).
if batch:
    pool.apply_async(my_fun, args=(batch,))
pool.close()  # no further tasks will be submitted
pool.join()   # wait for all outstanding batches to finish
基本上,我要从生成器中检索数据,收集到一个列表中,然后生成一个使用这批数据的进程。
这看起来不错,但是当消费者(即池中的工作进程)比生产者(即生成器)慢时,主进程的内存占用会不断增长,直到生成器耗尽——或者系统内存不足。
如何避免这个问题?
答案 0 :(得分:2)
在这种情况下,您可能要使用大小有限的队列。
%%javascript
// Register 'e' as a command-mode shortcut: edit the selected cell's text in an
// external editor (emacsclient) by round-tripping it through a kernel-side
// Python exec() snippet.
Jupyter.keyboard_manager.command_shortcuts.add_shortcut('e', {
handler : function (event) {
// Receives the kernel's printed output and writes it back into the cell.
function callback(msg) {
cell.set_text(msg.content.text);
}
var cell = Jupyter.notebook.get_selected_cell();
// Quote the cell text and *then* double any backslashes.
var cell_text = JSON.stringify(cell.get_text()).replace(/\\/g, "\\\\");
// The backtick template below is Python source executed on the kernel via
// exec(). It toggles the cell between normal state and "under edit in a temp
// file" (marked by the sep sentinel). It must stay byte-exact, so no
// comments are added inside it.
var cmd = `exec("""
cell_text = ${cell_text}
ext = "${cell.cell_type == 'code' ? 'py' : 'txt'}"
sep = "#-#-# under edit in file "
prefix, _, fname = cell_text.partition(sep)
if not fname or prefix:
# Create file and open editor, pass back placeholder.
import itertools, subprocess
for i in itertools.count():
fname = 'cell_{}.{}'.format(i, ext)
try:
with open(fname, 'x') as f:
f.write(cell_text)
except FileExistsError:
pass
else:
break
# Run editor in the background.
subprocess.Popen(['emacsclient', '-c', fname],
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
print(sep, fname, sep='', end='')
else:
# Cell has been in edit: read it in and pass it back, and delete it.
import os
try:
with open(fname, 'r') as f:
cell_text = f.read()
except FileNotFoundError:
print("# File {} could not be inserted back.".format(fname), end='')
else:
if cell_text.endswith('\\\\n'):
cell_text = cell_text[:-1]
print(cell_text, end='')
os.remove(fname)
try:
os.remove(fname + '~')
except FileNotFoundError:
pass
""", None, {})`;
// Run the Python snippet on the kernel; its stdout is streamed to callback,
// which replaces the cell contents.
Jupyter.notebook.kernel.execute(cmd, {iopub: {output: callback}},
{silent: false});
return false;
}}
);
给队列设置一个最大大小(maxsize),它就能提供必要的流量控制:当队列填满时,调用 q.put() 的线程会被阻塞,因此队列中待处理的工作项永远不会超过一定数量,从而限制了存储这些待处理项所需的内存。
或者,您可以使用计数信号量(例如multiprocessing.BoundedSemaphore(maxSize))。每次从生成器获取工作项时都对其进行获取,并在处理该工作项后将其释放到工作函数(my_fun)中。这样,等待处理的工作项的最大数量将永远不会超过信号量的初始值。
答案 1 :(得分:1)
使用 grouper 这个 itertools 配方,把来自生成器的数据按固定大小分块。
使用concurrent futures中的基础结构来处理任务提交和流程检索。
您可以按如下方式操作。
准备工作(尝试模拟您的处理过程):
import concurrent.futures
import itertools, time, collections, random
from pprint import pprint
# from itertools recipes
def grouper(iterable, n, fillvalue=None):
    """Yield successive n-tuples from *iterable*, padding the last with *fillvalue*.

    grouper('ABCDEFG', 3, 'x') --> ('A','B','C') ('D','E','F') ('G','x','x')
    """
    # n references to ONE shared iterator: zip_longest drains the same
    # underlying stream, so each tuple is a consecutive chunk.
    chunks = (iter(iterable),) * n
    return itertools.zip_longest(*chunks, fillvalue=fillvalue)
# generator/iterator facsimile
class G:
    """Long-winded range(n): an iterator counting down from n to 0."""

    def __init__(self, n=108):
        self.n = n    # next value to emit; iteration stops once it drops below 0
        self.a = []   # scratch list (unused here; kept for parity with original)

    def __iter__(self):
        return self

    def __next__(self):
        if self.n < 0:
            raise StopIteration
        current, self.n = self.n, self.n - 1
        return current
def my_func(*args):
    """Stand-in worker: stall for a random 1-10 seconds, then total the chunk.

    Called as my_func(chunk), so sum(*args) sums that single iterable argument.
    """
    pause = random.randint(1, 10)
    time.sleep(pause)
    total = sum(*args)
    return total
等待任务组完成
# Variant 1: when the pool is saturated, wait for the WHOLE in-flight group
# to finish before submitting more.
if __name__ == '__main__':
    nworkers = 4
    g = G()
    # generate data three-at-a-time
    data = grouper(g, 3, 0)
    results = []
    fs = []  # futures currently in flight (submitted, not yet harvested)
    with concurrent.futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
        for args in data:
            print(f'pending:{len(fs)}')
            # Throttle submission by counting our own outstanding futures
            # instead of reading the executor's private _pending_work_items
            # attribute, which is an implementation detail.
            if len(fs) == nworkers:
                # wait till all complete, then collect their results
                waited = concurrent.futures.wait(
                    fs, return_when=concurrent.futures.ALL_COMPLETED)
                results.extend(f.result() for f in waited.done)
                fs = list(waited.not_done)
            # add a new task
            fs.append(executor.submit(my_func, args))
        # data exhausted - get leftover results as they finish
        for future in concurrent.futures.as_completed(fs):
            results.append(future.result())
    pprint(results)
保持进程池满。
# Variant 2: when the pool is saturated, wait only until ONE task finishes,
# keeping the process pool as full as possible.
if __name__ == '__main__':
    nworkers = 4
    g = G()
    # generate data three-at-a-time
    data = grouper(g, 3, 0)
    results = []
    fs = []  # futures currently in flight (submitted, not yet harvested)
    with concurrent.futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
        for args in data:
            print(f'pending:{len(fs)}')
            # Throttle submission by counting our own outstanding futures
            # instead of reading the executor's private _pending_work_items
            # attribute, which is an implementation detail.
            if len(fs) == nworkers:
                # wait till at least one completes, then collect its result
                waited = concurrent.futures.wait(
                    fs, return_when=concurrent.futures.FIRST_COMPLETED)
                results.extend(f.result() for f in waited.done)
                fs = list(waited.not_done)
            # add a new task
            fs.append(executor.submit(my_func, args))
        # data exhausted - get leftover results as they finish
        for future in concurrent.futures.as_completed(fs):
            results.append(future.result())
    pprint(results)