我尝试使用python中的多处理功能将多页pdf拆分为许多单页pdf,但出现错误。我在stackover流上寻找了类似的帖子,但没有找到答案。
这是我的代码:
from PyPDF2 import PdfFileWriter, PdfFileReader
import os
from os import listdir
from os.path import isfile, join
from pdf2image import convert_from_path, convert_from_bytes
from wand.image import Image
import multiprocessing as mp
from functools import partial
def extract_page(pgnum, inputPDF, pdfFolder, file_name):
output = PdfFileWriter()
output.addPage(inputPDF.getPage(pgnum))
page = pdfFolder + "/" + file_name + "-%s.pdf" % (pgnum + 1)
with open(page, "wb") as outputStream:
output.write(outputStream)
def parallel_run(pages, inputPDF, pdfFolder, file_name):
pool = mp.Pool(2)
split_pdf = partial(extract_page, inputPDF = inputPDF, pdfFolder = pdfFolder, file_name = file_name)
results = pool.map(split_pdf, pages)
pool.close()
def splitPdf(file):
listPdf = []
inputpdf = PdfFileReader(open(file, "rb"))
file_name = os.path.basename(file)[:-4]
#create a folder to store splitted pdf
pdfFolder = os.path.dirname(file) +"/"+ file_name + "_split"
try:
os.mkdir(pdfFolder)
except OSError:
print("Directory %s is already existed" % pdfFolder)
listPdf = [pdfFolder + "/" + f for f in listdir(pdfFolder) if isfile(join(pdfFolder, f))]
listPdf = [f for f in listPdf if f.endswith(".pdf")]
else:
print("Successfully created the directory %s " % pdfFolder)
pgs = list(range(0, inputpdf.numPages))
parallel_run(pgs, inputpdf, pdfFolder, file_name)
listPdf = [pdfFolder + "/" + f for f in listdir(pdfFolder) if isfile(join(pdfFolder, f))]
listPdf = [f for f in listPdf if f.endswith(".pdf")]
return listPdf
这是我得到的错误。有没有人曾经遇到过这个问题?
Traceback (most recent call last):
File "C:/Users/tkdang/PycharmProjects/listSPDP/data_no_index2/split_pdf.py", line 79, in <module>
listPDF = splitPdf(path)
File "C:/Users/tkdang/PycharmProjects/listSPDP/data_no_index2/split_pdf.py", line 41, in splitPdf
parallel_run(pgs, inputpdf, pdfFolder, file_name)
File "C:/Users/tkdang/PycharmProjects/listSPDP/data_no_index2/split_pdf.py", line 22, in parallel_run
results = pool.map(split_pdf, pages)
File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py", line 290, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py", line 683, in get
raise self._value
File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\pool.py", line 457, in _handle_tasks
put(task)
File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\connection.py", line 206, in send
self._send_bytes(_ForkingPickler.dumps(obj))
File "C:\Users\tkdang\AppData\Local\Continuum\anaconda3\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
TypeError: cannot serialize '_io.BufferedReader' object