我一直在尝试使用以下代码从 PDF 文件中提取表格数据。
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox,LTChar, LTFigure
import sys
class PdfMinerWrapper(object):
    """Context manager that opens a PDF with pdfminer and iterates over its pages.

    Usage:
        with PdfMinerWrapper('2009t.pdf') as doc:
            for page in doc:
                # do something with the page
    """

    def __init__(self, pdf_doc, pdf_pwd=""):
        """Store the PDF path and password; nothing is opened until ``__enter__``.

        Args:
            pdf_doc: path of the PDF file to open.
            pdf_pwd: password passed to ``PDFDocument`` (empty string by default).
        """
        self.pdf_doc = pdf_doc
        self.pdf_pwd = pdf_pwd

    def __enter__(self):
        """Open the file, build the ``PDFDocument`` and return ``self``.

        Raises:
            Whatever ``open``, ``PDFParser`` or ``PDFDocument`` raise; the file
            handle is closed before the exception propagates.
        """
        # open the pdf file
        self.fp = open(self.pdf_doc, 'rb')
        try:
            # create a parser object associated with the file object
            parser = PDFParser(self.fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser, password=self.pdf_pwd)
            # connect the parser and document objects
            parser.set_document(doc)
        except Exception:
            # __exit__ is not invoked when __enter__ itself raises, so the
            # handle has to be released here to avoid leaking it.
            self.fp.close()
            raise
        self.doc = doc
        return self

    def _parse_pages(self):
        """Lazily yield one layout object per page of ``self.doc``."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(char_margin=3.5, all_texts=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(self.doc):
            interpreter.process_page(page)
            # receive the LTPage object for this page; it may contain child
            # objects like LTTextBox, LTFigure, LTImage, etc.
            yield device.get_result()

    def __iter__(self):
        """Return an iterator over the page layouts from ``_parse_pages``."""
        return iter(self._parse_pages())

    def __exit__(self, _type, value, traceback):
        """Close the underlying file; exceptions are not suppressed."""
        self.fp.close()
def main():
with PdfMinerWrapper(sys.argv[1]) as doc:
for page in doc:
print 'Page no.', page.pageid, 'Size', (page.height, page.width)
for tbox in page:
if not isinstance(tbox, LTTextBox):
continue
print ' '*1, 'Block', 'bbox=(%0.2f, %0.2f, %0.2f, %0.2f)'% tbox.bbox
for obj in tbox:
print ' '*2, obj.get_text().encode('UTF-8')[:-1], '(%0.2f, %0.2f, %0.2f, %0.2f)'% tbox.bbox
for c in obj:
if not isinstance(c, LTChar):
continue
print c.get_text().encode('UTF-8'), '(%0.2f, %0.2f, %0.2f, %0.2f)'% c.bbox, c.fontname, c.size,
print
# Run the dump only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
当我尝试运行该文件时,我收到以下错误:
python parse.py "/home/hp/AlgoLeap/poextracter/learning/pdftables/pdf_set/sample1.pdf"
Traceback (most recent call last):
File "parse.py", line 69, in <module>
main()
File "parse.py", line 52, in main
for page in doc:
File "parse.py", line 39, in _parse_pages
interpreter.process_page(page)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 836, in process_page
self.render_contents(page.resources, page.contents, ctm=ctm)
File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 848, in render_contents
self.execute(list_value(streams))
File "/usr/local/lib/python2.7/dist-packages/pdfminer/pdfinterp.py", line 878, in execute
raise PDFInterpreterError('Unknown operator: %r' % name)
pdfminer.pdfinterp.PDFInterpreterError: Unknown operator: 'x\x9c\x95Z'
我一直想用这段代码得到输出结果，但始终没有成功。有人能帮我解决这个问题吗？