我正在尝试合并1000多个pdf页面,并且可以在750个页面以下使用。如果我打开的数量超过750,它将对其进行处理,但输出文件为0字节。
from PyPDF3 import PdfFileWriter, PdfFileReader, PdfFileMerger
import os
import sys
from collections import OrderedDict
import win32file
win32file._setmaxstdio(8192)
print(win32file._getmaxstdio())
sys.setrecursionlimit(30000)
nameOfFile = os.path.basename(os.getcwd())
#get page number
def getPageNr(arg1):
stro = str(arg1)
stro=stro.replace('.pdf', '')
listR = stro.split(' - ')
listR[len(listR)-1] = listR[len(listR)-1].replace('-','')
listR[len(listR)-1] = listR[len(listR)-1].replace('Page ','')
pgNr=int(listR[len(listR)-1])
return pgNr
currentFolder = os.getcwd()
pdffiles = [os.path.join(name)
for root, dirs, files in os.walk(currentFolder)
for name in files
if name.endswith((".pdf"))]
#create dictionary and get whole list
di={}
#direct copy and create key from page number on back and value is original list
for string in pdffiles:
di.setdefault(getPageNr(string),str(string))
#sort it by keys
di2 = OrderedDict(sorted(di.items()))
pdffiles.clear()
for key,values in di2.items():
pdffiles.append(values)
#put a correction
pageAt = 0
adder = 421
pageAt = pageAt + adder
#add global variables for page in bookmark
mainTitlePage = 0
secondTitlePage = 0
thirdTitlePage = 0
#define globals for bookmarks
mainTitle = ''
SecondTitle = ''
thirdTitle = ''
#define previous bookmarks
lastMainTitle = ''
lastSecondTitle = ''
lastThirdTitle = ''
#if main title is same as next page
isSame = True
#start Merger
editer = PdfFileMerger()
#start main loop
while pageAt<(adder+2000) and pageAt<len(pdffiles) and isSame:
#break filename to titles
titles = pdffiles[pageAt].split(' - ')
#break next page for titles
titlesNext = pdffiles[pageAt+1].split(' - ')
#get titles
mainTitle = titles[0]
secondTitle = titles[1]
if not titlesNext[0] == mainTitle:
isSame = False
hasThird = False
if len(titles)>4:
thirdTitle = titles[2]
hasThird = True
else:
thirdTitle = None
hasThird = False
#open individual page
kStream = open(pdffiles[pageAt], 'rb')
inputK = PdfFileReader(kStream)
#test if titles are changing
if not mainTitle == lastMainTitle:
KmainParent = editer.addBookmark(mainTitle, 0)
if not secondTitle == lastSecondTitle:
secondTitlePage = pageAt-adder
#print(secondTitle)
Kparent = editer.addBookmark(secondTitle, secondTitlePage, KmainParent)
if hasThird:
if not thirdTitle == lastThirdTitle:
thirdTitlePage = pageAt-adder
Mparent = editer.addBookmark(thirdTitle, thirdTitlePage, Kparent)
editer.addBookmark(titles[3], pageAt-adder, Mparent)
else:
editer.addBookmark(titles[2], pageAt-adder, Kparent)
#merge page with fixed bookmarks
editer.merge((pageAt - adder), inputK)
#get titles and save them for future
lastMainTitle = mainTitle
lastSecondTitle = secondTitle
lastThirdTitle = thirdTitle
#go to next page
pageAt += 1
#get name for output file
nameOfFile = mainTitle + '.pdf'
print('Saving ' + nameOfFile)
#start new file and export it
outR = open(nameOfFile, 'wb')
editer.write(outR)
outR.close()
kStream.close()
现在,它会将所有书签放到那里,没有问题。但是如何处理超过750页。 我增加了递归限制和maxstdio ...但是,如果有1000页或更多的页面,则合并的文件为0字节,但是过程需要一两分钟,因此它正在处理。
我没有收到任何错误。
有人可以帮我处理500多页吗