我正在尝试从以下PDF中提取文本。
首先,我使用了 PyPDF2 库,代码如下:
from PyPDF2 import PdfFileReader
pdf = PdfFileReader('March_5_8000/1.pdf',strict=False)
information = pdf.getDocumentInfo()
这将引发以下错误:
/opt/conda/envs/fastai/lib/python3.6/site-packages/PyPDF2/pdf.py in getObject(self, indirectReference)
1597 if debug: print((" Uncompressed Object", indirectReference.idnum, indirectReference.generation, ":", start))
1598 self.stream.seek(start, 0)
-> 1599 idnum, generation = self.readObjectHeader(self.stream)
1600 if idnum != indirectReference.idnum and self.xrefIndex:
1601 # Xref table probably had bad indexes due to not being zero-indexed
/opt/conda/envs/fastai/lib/python3.6/site-packages/PyPDF2/pdf.py in readObjectHeader(self, stream)
1665 warnings.warn("Superfluous whitespace found in object header %s %s" % \
1666 (idnum, generation), utils.PdfReadWarning)
-> 1667 return int(idnum), int(generation)
1668
1669 def cacheGetIndirectObject(self, generation, idnum):
ValueError: invalid literal for int() with base 10: b'obj'
我也尝试使用 pdfrw 库,但是也返回了类似的错误(同样是在解析到 'obj' 标记时失败):
代码:
from pdfrw import PdfReader
x = PdfReader('March_5_8000/980.pdf')
x.keys()
错误:
PdfParseError Traceback (most recent call last)
<ipython-input-6-6d3575671b6c> in <module>
1 from pdfrw import PdfReader
----> 2 x = PdfReader('March_5_8000/980.pdf')
3 x.keys()
/opt/conda/envs/fastai/lib/python3.6/site-packages/pdfrw/pdfreader.py in __init__(self, fname, fdata, decompress, decrypt, password, disable_gc, verbose)
617 while 1:
618 source.obj_offsets = {}
--> 619 trailer, is_stream = self.parsexref(source)
620 prev = trailer.Prev
621 if prev is None:
/opt/conda/envs/fastai/lib/python3.6/site-packages/pdfrw/pdfreader.py in parsexref(self, source)
463 return self.readdict(source), False
464 else:
--> 465 source.exception('Expected "xref" keyword or xref stream object')
466
467 def readpages(self, node):
/opt/conda/envs/fastai/lib/python3.6/site-packages/pdfrw/tokens.py in exception(self, *arg)
227
228 def exception(self, *arg):
--> 229 raise PdfParseError(self.msg(*arg))
PdfParseError: Expected "xref" keyword or xref stream object (line=2238, col=39, token='obj')
我该如何解决?