我有很多文件(300~500 个)需要读取,希望能加快这项任务。
理想的写法是:
from multiprocessing import Pool
import os
import _io
# Lazily opens every entry of the current directory. The resulting open
# file objects are what the pool below tries to hand to its workers.
filelist = map(open, os.listdir())

if __name__ == '__main__':
    with Pool() as pool:
        # This call fails with
        #   TypeError: cannot serialize '_io.TextIOWrapper' object
        # because arguments sent to worker processes must be picklable and
        # open file objects are not. Pass file *names* to the workers and
        # open the files inside the worker function instead.
        a = pool.map(_io.TextIOWrapper.read, filelist)
当然,我收到了一个错误:
TypeError: cannot serialize '_io.TextIOWrapper' object
问题是:我能否通过并行来加速 I/O 过程?如果可以,该怎么做?
现在我找到了并行性的方法,并测试了我的代码:
我使用了22个项目,总计63.2 MB
from multiprocessing import Pool
import os
import _io
def my_read(file_name):
    """Return the full text content of the file at *file_name*.

    The file is opened in text mode with the platform's default encoding
    and is closed before returning. Taking a file name (a plain string)
    rather than an open file object keeps the argument picklable, so this
    function can be used as a ``Pool.map`` worker.
    """
    with open(file_name) as f:
        return f.read()
def mul():
    """Read every regular file in the current directory using a process pool.

    Returns:
        A list with the text of each file, in ``os.listdir()`` order.
    """
    # os.listdir() also yields sub-directories, which open() cannot read.
    names = [name for name in os.listdir() if os.path.isfile(name)]
    with Pool() as pool:
        return pool.map(my_read, names)
def single():
    """Read every regular file in the current directory sequentially.

    Returns:
        A list with the text of each file, in ``os.listdir()`` order.
    """
    contents = []
    for name in os.listdir():
        # os.listdir() also yields sub-directories, which open() cannot read.
        if not os.path.isfile(name):
            continue
        with open(name) as f:
            contents.append(f.read())
    return contents
# Benchmark entry point: run one variant at a time and compare wall time.
# The guard is required so that pool worker processes importing this module
# do not start pools of their own.
if __name__ == '__main__':
    mul()
    # single()
遗憾的是,single() 耗时为 0.4,而 mul() 耗时为 0.8。
有人说这是一个 I/O 密集型(I/O-bound)任务,所以无法通过并行来改进。但是,我在 Python 文档中找到了这样一句话:
However, threading is still an appropriate model if you want to run multiple I/O-bound tasks simultaneously.
完整的代码在这里:
我的目的是将 Epub 转换为 txt。我已经将 char2text 并行化,现在想加速 readall:
import zipfile
from multiprocessing import Pool
import bs4
def char2text(i):
    """Convert the markup of one chapter into plain text.

    Args:
        i: markup (XHTML) of a single chapter, as a string.

    Returns:
        The chapter's text with line endings normalised to a newline,
        surrounding whitespace stripped, and two newlines appended as a
        separator between chapters.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    soup = bs4.BeautifulSoup(i, 'html.parser')
    # A fragment without <body> would make soup.body None; fall back to the
    # whole document instead of raising AttributeError.
    root = soup.body if soup.body is not None else soup
    lines = root.getText().splitlines()
    return "\n".join(lines).strip() + "\n\n"
class Epub(zipfile.ZipFile):
    """Zip archive wrapper that exposes an EPUB's metadata and chapter text.

    When opened with ``mode='r'`` the constructor reads
    ``OEBPS/content.opf`` and ``OEBPS/images/cover.jpg`` and sets the
    attributes ``opf``, ``author``, ``title``, ``description``,
    ``chrpattern`` and ``cover``. In any other mode none of these
    attributes are set.
    """

    def __init__(self, file, mode='r', compression=0, allowZip64=False):
        """Open *file* as a zip archive and, in read mode, load its metadata.

        Raises:
            KeyError: in read mode, if the OPF or the cover entry is missing
                from the archive.
            AttributeError: in read mode, if the OPF has no ``dc:creator``
                or ``dc:title`` element.
        """
        super().__init__(file, mode, compression, allowZip64)
        if mode != 'r':
            return
        self.opf = self.read('OEBPS/content.opf').decode()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers are installed.
        opf_soup = bs4.BeautifulSoup(self.opf, 'html.parser')
        self.author = opf_soup.find(name='dc:creator').getText()
        self.title = opf_soup.find(name='dc:title').getText()
        # These two elements are optional; a missing one yields ''.
        self.description = self._optional_text(opf_soup, 'dc:description')
        self.chrpattern = self._optional_text(opf_soup, 'dc:chrpattern')
        self.cover = self.read('OEBPS/images/cover.jpg')

    @staticmethod
    def _optional_text(soup, tag):
        """Return the text of the first *tag* element in *soup*, or ''."""
        node = soup.find(name=tag)
        return '' if node is None else node.getText()

    def get_text(self):
        """Return the text of all chapters, concatenated in archive order.

        The result is also stored on ``self.tempread``. Chapter markup is
        converted to text by ``char2text`` in a process pool.
        """
        self.tempread = ""
        charlist = self.readall(self.namelist())
        if charlist:  # do not spawn worker processes for nothing
            with Pool() as pool:
                txtlist = pool.map(char2text, charlist)
            self.tempread = "".join(txtlist)
        return self.tempread

    def readall(self, namelist):
        """Return the decoded content of every chapter entry in *namelist*.

        An entry is treated as a chapter when its name starts with
        ``OEBPS/`` and ends with ``.xhtml``. Order follows *namelist*.
        """
        return [
            self.read(name).decode()
            for name in namelist
            if name.startswith('OEBPS/') and name.endswith('.xhtml')
        ]

    def epub2txt(self):
        """Write the book's text to ``<title>.txt`` in the current directory."""
        tempread = self.get_text()
        with open(self.title + '.txt', 'w', encoding='utf8') as f:
            f.write(tempread)
# Profiling entry point: convert one book and print cProfile statistics.
if __name__ == "__main__":
    import cProfile

    # Open the archive in a `with` block so it is closed even if the
    # conversion raises. `e` stays a module-level name, which is where
    # cProfile.run() evaluates the statement string.
    with Epub("assz.epub") as e:
        cProfile.run("e.epub2txt()")
答案 0 :(得分:0)
您尝试过类似的事情吗?
from multiprocessing import Pool
import os
import _io
def my_read(file_name):
    """Return the full text content of the file at *file_name*.

    The file is opened and closed inside the worker, so only the file name
    (going in) and the resulting string (coming back) cross the process
    boundary; both are picklable.
    """
    with open(file_name) as f:
        # Call the method on the file object itself rather than going
        # through the private `_io` module.
        return f.read()
if __name__ == '__main__':
    directory = 'some_dir'
    # os.listdir() returns bare entry names, so they must be joined with the
    # directory before open() can find them from another working directory.
    # Sub-directories are skipped because open() cannot read them.
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    paths = [path for path in candidates if os.path.isfile(path)]
    with Pool() as pool:
        a = pool.map(my_read, paths)
在子进程中打开/关闭文件听起来更合理,而且字符串很容易序列化。
对于 readall 方法,可以尝试:
def readall(self, namelist):
    """Return the decoded content of every chapter entry in *namelist*.

    An entry is treated as a chapter when its name starts with ``OEBPS/``
    and ends with ``.xhtml``. Results follow the order of *namelist*.

    The reads are dispatched to a thread pool rather than a process pool:
    a process pool has to pickle the callable and its bound ``self``, and
    neither a lambda nor an open archive can be pickled.
    """
    from multiprocessing.pool import ThreadPool

    def read_text(name):
        return self.read(name).decode()

    wanted = [
        name for name in namelist
        if name.startswith('OEBPS/') and name.endswith('.xhtml')
    ]
    if not wanted:
        return []
    # NOTE(review): this calls self.read() from several threads at once;
    # confirm the archive object tolerates concurrent reads on the Python
    # version in use, and measure whether it beats a plain loop.
    with ThreadPool() as pool:
        return pool.map(read_text, wanted)