首先,我使用的是 Python 3.5.1(32 位版本)。我编写了以下程序,使用 PyPDF2 和 reportlab 为我的 PDF 文件的所有页面添加页码:
#import modules
from os import listdir
from PyPDF2 import PdfFileWriter, PdfFileReader
import io
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
# Default x/y coordinates at which the page-number text is drawn; both can be
# overridden interactively further down in the script.
X_value = 460
Y_value = 820

# Names of every entry in the current working directory.
filelist = listdir()

# Keep only the entries whose name ends in ".pdf".  The comparison is
# case-insensitive so "SCAN.PDF" is picked up too, and testing the end of the
# name (rather than the three characters after any dot) means "a.pdf.txt" is
# skipped and "a.pdf.pdf" is listed exactly once.
PDFlist = [filename for filename in filelist if filename.lower().endswith('.pdf')]
def _ask_position(prompt, default):
    """Ask for an integer coordinate; an empty answer selects *default*.

    The question is repeated until the answer is either empty or a valid
    integer, so a typo no longer aborts the script with a ValueError.
    """
    while True:
        answer = input(prompt).strip()
        if not answer:
            return default
        try:
            return int(answer)
        except ValueError:
            print("Please enter a whole number, or press ENTER for the default.")


# Horizontal position of the page number (ENTER keeps the default of 460).
X_value = _ask_position('Give horizontal position page number (ENTER = default 460): ', X_value)
# Vertical position of the page number (ENTER keeps the default of 820).
# The prompt previously said "horizontal" here, which was a copy/paste slip.
Y_value = _ask_position('Give vertical position page number (ENTER = default 820): ', Y_value)
# Local import keeps this section self-contained: the output folder must exist
# before the first result file can be written into it.
from os import makedirs

makedirs("Output", exist_ok=True)

# Stamp "Page k/N" on every page of every PDF found earlier and store the
# result under Output/ using the original file name.
for filename in PDFlist:
    # Keep the source file open until the result has been serialised: while
    # writing, PyPDF2 still resolves objects through the reader.
    with open(filename, "rb") as source_file:
        existing_pdf = PdfFileReader(source_file)
        print("File: "+filename)
        # count the number of pages
        number_of_pages = existing_pdf.getNumPages()
        print("Number of pages detected:"+str(number_of_pages))
        output = PdfFileWriter()
        for k in range(number_of_pages):
            # Build a one-page overlay PDF in memory with reportlab.
            packet = io.BytesIO()
            can = canvas.Canvas(packet, pagesize=A4)
            Pagenumber = " Page "+str(k+1)+"/"+str(number_of_pages)
            # White filled rectangle first, to cover any text already
            # present at the spot where the page number goes.
            can.setFillColorRGB(1, 1, 1)
            can.setStrokeColorRGB(1, 1, 1)
            can.rect(X_value-10, Y_value-5, 120, 20, fill=1)
            # Page number itself: Helvetica 14 pt, drawn in red.
            can.setFont("Helvetica", 14)
            can.setFillColorRGB(1, 0, 0)
            can.drawString(X_value, Y_value, Pagenumber)
            can.save()
            print(Pagenumber)
            # Rewind the buffer so it can be read back as a PDF.
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            # Merge the overlay (the "watermark") onto the existing page.
            page = existing_pdf.getPage(k)
            page.mergePage(new_pdf.getPage(0))
            output.addPage(page)
        # Serialise into memory first: if PyPDF2 raises while writing, no
        # empty 0-byte file is left behind in the output folder.
        result_buffer = io.BytesIO()
        output.write(result_buffer)
    # finally, write the finished document to a real file
    ResultPDF = "Output/"+filename
    with open(ResultPDF, "wb") as outputStream:
        outputStream.write(result_buffer.getvalue())
此程序对大量 PDF 文件都能正常工作(虽然有时会产生类似 `PdfReadWarning: Superfluous whitespace found in object header b'16' b'0' [pdf.py:1666]` 的警告,但输出的结果文件在我看来没有问题)。
但是,该程序对某些 PDF 文件不起作用,尽管这些文件可以用我的 Adobe Acrobat 正常读取和编辑。我的印象是错误主要出现在扫描得到的 PDF 文件上,但并非所有扫描文件都会出错(我也给一些扫描的 PDF 文件添加过页码,并没有产生任何错误)。
我收到以下错误消息(前8行是我自己的打印命令的结果):
File: Scanned file.pdf
Number of pages detected:6
Page 1/6
Page 2/6
Page 3/6
Page 4/6
Page 5/6
Page 6/6
PdfReadWarning: Object 25 1 not defined. [pdf.py:1629]
Traceback (most recent call last):
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\Sourcecode\PDFPager.py", line 83, in <module>
output.write(outputStream)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 482, in write
self._sweepIndirectReferences(externalReferenceMap, self._root)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, data[i])
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 571, in _sweepIndirectReferences
self._sweepIndirectReferences(externMap, realdata)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 547, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, value)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 556, in _sweepIndirectReferences
value = self._sweepIndirectReferences(externMap, data[i])
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 577, in _sweepIndirectReferences
newobj = data.pdf.getObject(data)
File "C:\Users\User\AppData\Local\Programs\Python\Python35-32\lib\site-packages\PyPDF2\pdf.py", line 1631, in getObject
raise utils.PdfReadError("Could not find object.")
PyPDF2.utils.PdfReadError: Could not find object.
显然,各页面已经与 reportlab 创建的 PDF 成功合并(参见输出中 “Page 6/6” 这一行),但最终 PyPDF2 无法生成输出 PDF 文件(我得到的是一个 0 字节、无法读取的输出文件)。有人可以解释如何解决这个问题吗?我搜索了互联网,但没有找到答案。
答案 0 :(得分:1)
使用 `strict=False` 解决了我的问题。
from PyPDF2 import PdfFileMerger
# Input files, merged in the order listed.
pdfs = [r'file 1.pdf', r'file 2.pdf']
# strict=False is the setting this answer relies on; see the PyPDF2
# documentation for the exact set of problems it tolerates.
merger = PdfFileMerger(strict=False)
try:
    for pdf in pdfs:
        merger.append(pdf)
    merger.write(r"thanks mate.pdf")
finally:
    # Release the file handles the merger opened, even if merging failed.
    merger.close()
答案 1 :(得分:0)
在pdf.py上进行以下更改:
pdf.py 的第 1633 行(即取消注释 `if self.strict`)
# NOTE(review): `mylist` is not defined in this snippet; it is presumably a
# flat sequence of alternating value/key items -- confirm against the caller.
# keys: the items at odd indexes (1, 3, 5, ...)
k = mylist[1::2]
# values: the items at even indexes (0, 2, 4, ...)
v = mylist[::2]
# dictionary mapping each key to the item stored immediately before it;
# zip() stops at the shorter sequence, so an unpaired trailing item is dropped
mydict = dict(zip(k, v))
并在 pdf.py 的第 501 行进行以下更改(添加 try/except 块)
# Raise for the missing object only when strict mode is enabled; with strict
# disabled, execution continues past this check.
if self.strict:
    raise utils.PdfReadError("Could not find object.")
干杯。
答案 2 :(得分:0)
这是我的解决方案:先尝试将文件写入一个临时的 BytesIO 流中,以检查该 PDF 是否已损坏。
try:
reader = PdfFileReader(input_file)
print("Opening '{}', pages={}".format(file_path, reader.getNumPages()))
# Try to write it into an dummy ByteIO stream to check whether pdf is broken
writer = PdfFileWriter()
writer.addPage(reader.getPage(0))
writer.write(io.BytesIO())
except PdfReadError:
print("Error reading '{}".format(file_path))
continue