My code uses multiprocessing on Ubuntu with 12 vCPU cores to process more than 10,000 files.
import os
import multiprocessing

import nltk

def process_file(name):
    inp = open(name)
    out = open(name.split('.')[0] + 'wikiout.txt', 'a')
    for row in inp:
        text = row.strip()
        sent_text = nltk.sent_tokenize(text)
        for sent in sent_text:
            pass  # process sentence
    inp.close()
    out.close()

if __name__ == '__main__':
    processes = []
    for i in 'ABCDEF':
        for j in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
            for k in range(100):
                filename = os.path.join(os.path.dirname(__file__), i + j + '/' + 'wiki_' + str(k) + '.txt')
                p = multiprocessing.Process(target=process_file, args=(filename,))
                processes.append(p)
                p.start()
    for process in processes:
        process.join()
For some reason, I keep hitting this error:
File "wikirules.py", line 37, in <module>
p.start()
File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start
self._popen = self._Popen(self)
File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen
return _default_context.get_context().Process._Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 69, in _launch
child_r, parent_w = os.pipe()
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
File "wikirules.py", line 13, in process_file
File "/usr/local/lib/python3.8/dist-packages/nltk/tokenize/__init__.py", line 106, in sent_tokenize
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 752, in load
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 877, in _open
File "/usr/local/lib/python3.8/dist-packages/nltk/data.py", line 327, in open
OSError: [Errno 24] Too many open files: '/root/nltk_data/tokenizers/punkt/PY3/english.pickle'
Any idea why this is happening? I'm still new to multiprocessing, but this shouldn't be opening more than 12 files at a time, should it?
Answer 0 (score: 1)
Your code is trying to run

len('ABCDEF') * len('ABCD...Z') * len(range(100)) = 6 * 26 * 100 = 15 600

OS processes simultaneously.
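Every child process needs its own pipes and inherited file descriptors, so the parent runs into the per-process open-file limit (often 1024 on a default Ubuntu install) long before all 15,600 children have been started, which is exactly what Errno 24 means. As a minimal sketch, assuming a Unix system and only the standard-library resource module, you can inspect that limit from Python:

import resource

# The soft limit is what os.pipe()/open() actually hit; the hard limit is
# the ceiling an unprivileged process may raise the soft limit to.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print(f"open-file limit: soft={soft}, hard={hard}")

Raising the limit only postpones the problem, though; the real fix is to bound the number of concurrent processes, as below.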
The multiprocessing module actually provides relatively low-level primitives for working with multiple processes. For basic tasks, the standard library recommends a safer and more convenient option: the concurrent.futures module, which contains pool implementations for both threads and processes and can be very useful, especially for "embarrassingly parallel" workloads.
Below is an example of how the code from your question could be transformed using concurrent.futures and a few other Python features such as generators, context managers, and the pathlib module.
import concurrent.futures as futures
import itertools
import pathlib

import nltk

BASE_PATH = pathlib.Path(__file__).parent.absolute()

def filename_generator():
    """produce filenames sequence"""
    for i, j, k in itertools.product("ABCDEF", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", range(100)):
        yield BASE_PATH / f"{i}{j}/wiki_{k}.txt"

def worker(filename: pathlib.Path):
    """do all the job"""
    out_filename = filename.with_suffix('.wikiout.txt')
    with open(filename) as inp, open(out_filename, "a") as out:
        for row in inp:
            text = row.strip()
            sent_text = nltk.sent_tokenize(text)
            for sent in sent_text:
                """process sentence"""

def main():
    with futures.ProcessPoolExecutor() as pool:
        # mapping future->filename, useful in case of error
        task_to_filename = {pool.submit(worker, f): f for f in filename_generator()}
        for f in futures.as_completed(task_to_filename):
            try:
                f.result()
            except Exception as e:
                filename = task_to_filename[f]
                print(f"{filename} processing failed: {e}")

if __name__ == "__main__":
    main()
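Note on the design choice: ProcessPoolExecutor() called without arguments sizes the pool to os.cpu_count() workers, so on your 12-vCPU machine at most 12 files are being processed (and therefore held open) at any moment; the remaining submitted tasks simply wait in the executor's internal queue. If you want a different degree of parallelism, you can pass max_workers explicitly, e.g. futures.ProcessPoolExecutor(max_workers=12).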