Memory consumption grows steadily when using multiprocessing.Queue

Asked: 2019-04-21 11:28:28

Tags: python memory queue python-multiprocessing

Similar questions have been asked before, but my setup is slightly different. In the code below, memory consumption keeps climbing while I monitor it. For small files that is not a problem, but I am processing a file that is gigabytes in size, so eventually I will run into a MemoryError.

The idea is that a single reader process reads batches of lines into a queue. A number of workers consume this queue and put their results onto a second queue, which in turn is consumed by a writer process that writes the results to an output file. The workers also return a value when they finish, which is then picked up by the main process. This is mostly diagnostic information (e.g. the number of lines processed), but I do want to get it back.

The work being done here is pushing text through a spaCy pipeline, which should not have any memory leaks.
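
For reference, here is a minimal, self-contained sketch (separate from the full script below) of feeding a batch of texts through the pipeline and pulling out sentences, assuming spaCy 2.x and the same en_core_web_sm model. nlp.pipe yields Doc objects lazily, so the Docs only stay in memory if you collect them into a list yourself:

import spacy

nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])

texts = ["This is a sentence. Here is another one.", "A second, shorter text."]

# nlp.pipe returns a generator; consuming it in a for-loop keeps only the
# current Doc (plus spaCy's internal buffer) alive, rather than a full list.
sents = []
for doc in nlp.pipe(texts):
    sents.extend(sent.text for sent in doc.sents)

print(len(sents), sents[0])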

One thing I can think of is that each subprocess uses its own spaCy instance (does it?), so the string store / vocabulary is specific to each process. That would mean the size of the whole vocabulary is essentially duplicated across all subprocesses. Is that true? If so, is there a way to use just one "lookup table" / Vocab instance across multiple spaCy instances? (A small diagnostic sketch for checking this is included after the script below.) If this is not the problem, do you have any other idea of what might be going wrong?

import logging
import multiprocessing as mp
from os import cpu_count

from spacy.util import minibatch
import spacy

import psutil

logging.basicConfig(datefmt='%d-%b %H:%M:%S',
                    format='%(asctime)s - [%(levelname)s]: %(message)s',
                    level=logging.INFO,
                    handlers=[
                        logging.FileHandler('progress.log'),
                        logging.StreamHandler()
                    ])

NLP = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])
N_WORKERS = cpu_count()-1 or 1

""" Process a large input file. A separate reader process puts batches of lines in a queue,
    picked up by workers who in turn process these lines. They put the results in a new queue and return some
    diagnostics information after finishing. The results are read by a separate writer process that writes the
    results to a new file.
"""


def reader(src, work_q, batch_size=1000):
    with open(src, encoding='utf-8') as fhin:
        lines = (line.strip() for line in fhin)
        # minibatch is a generator, and as such this approach
        # should be memory-lenient
        for batch_idx, batch in enumerate(minibatch(lines, batch_size), 1):
            work_q.put((batch, batch_idx))

    # Notify all workers that work is done
    for _ in range(N_WORKERS):
        work_q.put('done')

    logging.info('Done reading!')

def writer(results_q):
    with open('out.txt', 'w') as fhout:
        while True:
            # Get values from results queue; write those to a file
            m = results_q.get()
            if m == 'done':
                logging.info('Done writing everything to file!')
                break

            fhout.write('\n'.join(m) + '\n')
            fhout.flush()

    logging.info('Done writing!')

def spacy_process(texts):
    docs = list(NLP.pipe(texts))
    sents = [sent.text for doc in docs for sent in doc.sents]

    return sents, len(sents)

def _start_worker(work_q, results_q):
    # Keep track of some values, e.g. lines processed
    lines_processed = 0
    while True:
        m = work_q.get()
        if m == 'done':
            logging.info('Done reading from file!')
            break

        batch, batch_idx = m
        result, n_lines = spacy_process(batch)
        results_q.put(result)
        lines_processed += n_lines

        if batch_idx == 1 or batch_idx % 25 == 0:
            logging.info(f"Memory usage (batch #{batch_idx:,}):"
                         f" {psutil.virtual_memory().percent}%")

    logging.info('Worker is done working!')

    return lines_processed


if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('fin', help='input file.')
    args = parser.parse_args()

    with mp.Manager() as manager, mp.Pool(N_WORKERS+2) as pool:
        logging.info(f"Started a pool with {N_WORKERS} workers")
        results_queue = manager.Queue(maxsize=N_WORKERS*10)
        work_queue = manager.Queue(maxsize=N_WORKERS*10)

        _ = pool.apply_async(writer, (results_queue, ))
        _ = pool.apply_async(reader, (args.fin, work_queue))

        worker_jobs = []
        for _ in range(N_WORKERS):
            job = pool.apply_async(_start_worker, (work_queue, results_queue))
            worker_jobs.append(job)

        # When a worker has finished its job, get its information back
        total_n_sentences = 0
        for job in worker_jobs:
            n_sentences = job.get()
            total_n_sentences += n_sentences

        # Notify the writer that we're done
        results_queue.put('done')

    logging.info(f"Finished processing {args.fin}. Processed {total_n_sentences} sentences.")
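
To check the per-process vocabulary question raised above, here is a small diagnostic sketch that is not part of the original script (the inspect_vocab helper is just an illustrative name). It reports each worker's pid together with the size of its string store before and after processing a batch; if the size grows independently under every pid, each worker is indeed holding and growing its own copy of the vocabulary:

import os
import multiprocessing as mp

import spacy

NLP = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])

def inspect_vocab(texts):
    # Size of this process's string store before and after processing.
    before = len(NLP.vocab.strings)
    for _ in NLP.pipe(texts):
        pass
    after = len(NLP.vocab.strings)
    return os.getpid(), before, after

if __name__ == '__main__':
    batches = [["One batch of text to parse."], ["A different batch entirely."]]
    with mp.Pool(2) as pool:
        for pid, before, after in pool.map(inspect_vocab, batches):
            print(f"pid={pid}: string store size {before} -> {after}")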

I am not sure why memory usage keeps rising. I have bounded the size of the queues, so they should not grow beyond the limit I set. I flush the writer frequently so that its buffer does not grow large. So what am I missing?
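
One possible way to narrow this down (not part of the original post): psutil.virtual_memory().percent is system-wide, so it cannot tell you which process is growing. Logging each process's own resident set size instead, for example with a helper like the hypothetical log_rss below, would show whether it is the workers, the reader/writer, or the Manager process whose memory keeps climbing:

import os
import logging

import psutil

def log_rss(tag):
    # Resident set size of the *current* process in MiB. Called from inside
    # each worker/reader/writer, this shows which process is actually growing,
    # unlike psutil.virtual_memory().percent, which reports system-wide usage.
    rss_mib = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)
    logging.info(f"[pid {os.getpid()}] {tag}: RSS = {rss_mib:,.1f} MiB")

Calling log_rss(f'batch #{batch_idx}') where the script currently logs psutil.virtual_memory().percent (and something similar in reader and writer) would make the per-process trend visible.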

0 Answers

No answers yet.