Multiprocessing Pool and generators

Date: 2018-06-27 19:54:27

Tags: python multiprocessing generator

First, consider the following code:

import multiprocessing

pool = multiprocessing.Pool(processes=N)
batch = []
for item in generator():
    batch.append(item)
    if len(batch) == 10:
        pool.apply_async(my_fun, args=(batch,))
        batch = []
# leftovers
if batch:
    pool.apply_async(my_fun, args=(batch,))

Basically, I pull data from a generator, collect it into a list, and then spawn a pool task that consumes that batch.

This looks fine, but when the consumers (i.e. the pool processes) are slower than the producer (i.e. the generator), the memory usage of the main process keeps growing until the generator stops or... the system runs out of memory.

How can I avoid this problem?

2 answers:

Answer 0 (score: 2)

In this case you probably want to use a size-limited queue.

With a maximum size, the queue gives you the counting you need and blocks the thread calling q.put() when the queue is full, so you can never post more than a certain number of work items on it, which limits the memory needed to store the pending items.
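A minimal sketch of that approach (assuming the generator(), my_fun and N from the question; the queue size of 2 * N and the None sentinels are illustrative choices, not part of the original answer):

import multiprocessing

def worker(q):
    # Pull batches off the queue until a None sentinel arrives.
    while True:
        batch = q.get()
        if batch is None:
            break
        my_fun(batch)

if __name__ == '__main__':
    q = multiprocessing.Queue(maxsize=2 * N)  # put() blocks once 2*N batches are pending
    workers = [multiprocessing.Process(target=worker, args=(q,)) for _ in range(N)]
    for w in workers:
        w.start()

    batch = []
    for item in generator():
        batch.append(item)
        if len(batch) == 10:
            q.put(batch)   # blocks here when the consumers fall behind
            batch = []
    if batch:
        q.put(batch)       # leftovers
    for _ in workers:
        q.put(None)        # one sentinel per worker
    for w in workers:
        w.join()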

Alternatively, you can use a counting semaphore (e.g. multiprocessing.BoundedSemaphore(maxSize)). Acquire it each time a work item is taken from the generator, and release it once that work item has been processed by your work function (my_fun). That way, the number of work items waiting to be processed never exceeds the semaphore's initial value.
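A sketch of the semaphore variant (again assuming generator(), my_fun and N from the question; MAX_PENDING is an illustrative limit, and the release is done in an apply_async callback in the parent process rather than inside my_fun, since a multiprocessing semaphore cannot be passed to pool workers as an ordinary argument):

import multiprocessing

MAX_PENDING = 20  # illustrative cap on the number of batches in flight

if __name__ == '__main__':
    sem = multiprocessing.BoundedSemaphore(MAX_PENDING)
    pool = multiprocessing.Pool(processes=N)

    def submit(batch):
        sem.acquire()  # blocks once MAX_PENDING batches are waiting or being processed
        pool.apply_async(my_fun, args=(batch,),
                         callback=lambda _: sem.release(),
                         error_callback=lambda _: sem.release())

    batch = []
    for item in generator():
        batch.append(item)
        if len(batch) == 10:
            submit(batch)
            batch = []
    if batch:
        submit(batch)  # leftovers
    pool.close()
    pool.join()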

Answer 1 (score: 1)

Use the grouper itertools recipe to chunk the data from your generator.

Use the infrastructure in concurrent.futures to handle task submission and result retrieval.

You could either:

  • submit a group of tasks; wait for them to complete; then submit another group, or
  • submit a new task each time one finishes, keeping the pipeline full.

Setup (an attempt to simulate your process):

import concurrent.futures
import itertools, time, collections, random
from pprint import pprint

# from itertools recipes
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)

# generator/iterator facsimile
class G:
    '''Long-winded range(n)'''
    def __init__(self, n=108):
        self.n = n
        self.a = []
    def __iter__(self):
        return self
    def __next__(self):
        #self.a.append(time.perf_counter())
        if self.n < 0:
            raise StopIteration
        x = self.n
        self.n -= 1
        return x

def my_func(*args):
    time.sleep(random.randint(1,10))
    return sum(*args)

Wait for a group of tasks to complete

if __name__ == '__main__':
    nworkers = 4
    g = G()
    # generate data three-at-a-time
    data = grouper(g, 3, 0)
    results = []
    fs = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
        for args in data:
            print(f'pending:{len(executor._pending_work_items)}')
            # block submission - limit pending tasks to conserve resources (memory) 
            if len(executor._pending_work_items) == nworkers:
                # wait till all complete and get the results
                futures = concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
                #print(futures)
                results.extend(future.result() for future in futures.done)
                fs = list(futures.not_done)
            # add a new task
            fs.append(executor.submit(my_func, args))
        # data exhausted - get leftover results as they finish
        for future in concurrent.futures.as_completed(fs):
            print(f'pending:{len(executor._pending_work_items)}')
            result = future.result()
            results.append(result)

    pprint(results)

Keep the process pool full

if __name__ == '__main__':
    nworkers = 4
    g = G()
    # generate data three-at-a-time
    data = grouper(g, 3, 0)
    results = []
    fs = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=nworkers) as executor:
        for args in data:
            print(f'pending:{len(executor._pending_work_items)}')
            # block submission - limit pending tasks to conserve resources (memory) 
            if len(executor._pending_work_items) == nworkers:
                # wait till one completes and get the result
                futures = concurrent.futures.wait(fs, return_when=concurrent.futures.FIRST_COMPLETED)
                #print(futures)
                results.extend(future.result() for future in futures.done)
                fs = list(futures.not_done)
            # add a new task
            fs.append(executor.submit(my_func, args))
        # data exhausted - get leftover results as they finish
        for future in concurrent.futures.as_completed(fs):
            print(f'pending:{len(executor._pending_work_items)}')
            result = future.result()
            results.append(result)

    pprint(results)