PyPDF3合并限制解决方法

时间:2018-09-24 22:37:16

标签: python stack pypdf pypdf2

我正在尝试合并1000多个PDF页面。页数在750以下时一切正常;但如果合并的页数超过750,程序仍会运行并处理,输出文件却是0字节。

 from PyPDF3 import PdfFileWriter, PdfFileReader, PdfFileMerger
 import os
 import sys
 from collections import OrderedDict
 import win32file

 # Raise the limit on simultaneously open stdio streams; the merge loop
 # further down opens one file per PDF page and keeps it open.
 # NOTE(review): presumably this wraps the C runtime's _setmaxstdio --
 # confirm that 8192 is accepted by the runtime in use.
 win32file._setmaxstdio(8192)

 # Print the limit that is now in effect.
 print(win32file._getmaxstdio())

 # NOTE(review): a Python recursion limit this high may let deep recursion
 # exhaust the native stack and kill the interpreter without a traceback,
 # which could look like "no error, 0-byte output" -- TODO confirm.
 sys.setrecursionlimit(30000)

 # Last path component of the current working directory.
 # NOTE(review): this value is reassigned before it is ever read.
 nameOfFile = os.path.basename(os.getcwd())

 #get page number
 def getPageNr(arg1):
     """Return the page number encoded at the end of a PDF file name.

     The name is converted with ``str()``, every ``.pdf`` occurrence is
     removed, and the text after the last ``' - '`` separator is taken as
     the page field. Hyphens and the ``'Page '`` prefix are stripped from
     that field before it is parsed as an integer, e.g.
     ``'Main - Sub - Page 12.pdf'`` -> ``12``.

     Args:
         arg1: File name (any object; ``str(arg1)`` is what gets parsed).

     Returns:
         The page number as an ``int``.

     Raises:
         ValueError: If the last field does not contain a valid integer.
     """
     name = str(arg1).replace('.pdf', '')
     # Only the field after the last ' - ' separator carries the page number.
     pageField = name.split(' - ')[-1]
     # Hyphens are removed before the 'Page ' prefix, matching the order the
     # name format relies on (e.g. 'Page -7' -> 'Page 7' -> '7').
     digits = pageField.replace('-', '').replace('Page ', '')
     try:
         return int(digits)
     except ValueError:
         # Same exception type as before, but say which file is at fault.
         raise ValueError(
             'cannot extract a page number from {!r} (last field: {!r})'
             .format(str(arg1), pageField)
         ) from None

currentFolder = os.getcwd()

# Collect the bare file name of every PDF found under the current folder.
# NOTE(review): os.walk descends into subfolders but only the file name is
# kept, so a PDF located in a subfolder cannot be opened later by name alone.
pdffiles = []
for _root, _dirs, fileNames in os.walk(currentFolder):
    for fileName in fileNames:
        if fileName.endswith(".pdf"):
            pdffiles.append(fileName)

# Map page number -> file name. When two files yield the same page number,
# the first one encountered is kept and later ones are ignored.
pageToFile = {}
for fileName in pdffiles:
    pageNr = getPageNr(fileName)
    if pageNr not in pageToFile:
        pageToFile[pageNr] = str(fileName)

# Refill the same list object with the file names in ascending page order.
pdffiles[:] = [pageToFile[pageNr] for pageNr in sorted(pageToFile)]

# Merge the sorted single-page PDFs into one document with nested bookmarks.
#
# File names are expected to look like
#   "<main> - <second> - <entry> - <page>.pdf"            (no third level)
#   "<main> - <second> - <third> - <entry> - <page>.pdf"  (with third level)
# Merging starts at index `adder` of the sorted list and stops when the main
# title changes, when the list ends, or after 2000 files.

# Index of the first file to merge; positions inside the output document are
# therefore expressed as `pageAt - adder`.
adder = 421
pageAt = adder

# Output page positions of the most recently created bookmarks.
mainTitlePage = 0
secondTitlePage = 0
thirdTitlePage = 0

# Titles of the page currently being processed.
mainTitle = ''
secondTitle = ''
thirdTitle = ''

# Titles of the previously processed page, used to detect a change.
lastMainTitle = ''
lastSecondTitle = ''
lastThirdTitle = ''

# Becomes False once the next file belongs to a different main title.
isSame = True

editer = PdfFileMerger()

# Every input stream stays open until the merged file has been written, as
# the original did for its last stream (presumably the merger reads page data
# lazily -- confirm against the PyPDF3 docs). They are all tracked here so
# that each one is closed afterwards instead of only the last.
openStreams = []

try:
    while pageAt < (adder + 2000) and pageAt < len(pdffiles) and isSame:

        # Break the file name into its title fields.
        titles = pdffiles[pageAt].split(' - ')
        mainTitle = titles[0]
        secondTitle = titles[1]

        # Look ahead only when a next file exists; on the last file the
        # unguarded lookup raised IndexError before anything was written.
        if pageAt + 1 < len(pdffiles):
            titlesNext = pdffiles[pageAt + 1].split(' - ')
            if titlesNext[0] != mainTitle:
                isSame = False

        # More than four fields means the name carries a third-level title.
        hasThird = len(titles) > 4
        thirdTitle = titles[2] if hasThird else None

        # Open the individual page and remember the stream for cleanup.
        kStream = open(pdffiles[pageAt], 'rb')
        openStreams.append(kStream)
        inputK = PdfFileReader(kStream)

        # Create a bookmark level only when its title changed.
        if mainTitle != lastMainTitle:
            KmainParent = editer.addBookmark(mainTitle, 0)

        if secondTitle != lastSecondTitle:
            secondTitlePage = pageAt - adder
            Kparent = editer.addBookmark(secondTitle, secondTitlePage, KmainParent)

        if hasThird:
            # NOTE(review): if the second title changes while the third title
            # text stays the same, Mparent still refers to the bookmark under
            # the previous second-level parent -- verify this is intended.
            if thirdTitle != lastThirdTitle:
                thirdTitlePage = pageAt - adder
                Mparent = editer.addBookmark(thirdTitle, thirdTitlePage, Kparent)

            editer.addBookmark(titles[3], pageAt - adder, Mparent)
        else:
            editer.addBookmark(titles[2], pageAt - adder, Kparent)

        # Insert the page at its position in the output document.
        editer.merge((pageAt - adder), inputK)

        # Remember the titles for the next iteration.
        lastMainTitle = mainTitle
        lastSecondTitle = secondTitle
        lastThirdTitle = thirdTitle

        # Go to the next page.
        pageAt += 1

    # The output file is named after the main title that was merged.
    nameOfFile = mainTitle + '.pdf'
    print('Saving ' + nameOfFile)

    # `with` closes the output file even if writing fails.
    with open(nameOfFile, 'wb') as outR:
        editer.write(outR)
finally:
    # Release every input file handle, whether or not the merge succeeded.
    for stream in openStreams:
        stream.close()

目前,所有书签都能正确写入,没有问题。但是超过750页时该如何处理?我已经提高了递归限制和maxstdio……但是,当页面达到1000页或更多时,合并后的文件是0字节;整个过程要运行一两分钟,说明程序确实在处理。

我没有收到任何错误。

有人可以帮我处理500多页的情况吗?

0 个答案:

没有答案