我在 Stack Overflow 上找到了一位用户分享的 PDF 数据提取代码。但是从输出来看,它是逐列提取的。有没有办法让 pdfminer.six 逐行读取数据?
这是我使用的代码(与原始代码相比略有修改,删除了注释以便于阅读)。另附一张当前输出的截图,使用的是一个示例 PDF。
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
# Open the PDF in binary mode. The handle must stay open while pages are
# processed further down, because the parser reads from it.
fp = open('test.pdf', 'rb')
parser = PDFParser(fp)
document = PDFDocument(parser)

# Refuse to continue when the document does not permit text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Shared resource manager used by both the device and the interpreter.
rsrcmgr = PDFResourceManager()

# Layout-analysis parameters (library defaults).
# NOTE(review): how text is grouped into boxes is governed by these values;
# confirm against the pdfminer.six LAParams documentation before tuning.
laparams = LAParams()

# The aggregator collects each processed page; its result is fetched with
# device.get_result(). The earlier plain PDFDevice assignment was dropped
# because it was overwritten by this one before ever being used.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
def parse_obj(lt_objs):
    """Print the text of each horizontal text box in ``lt_objs``.

    Every ``LTTextBoxHorizontal`` element has its text printed on one line,
    with all newline characters removed. Every ``LTFigure`` element is
    descended into recursively through its ``_objs`` children. Elements of
    any other type are ignored.

    Args:
        lt_objs: Iterable of pdfminer layout objects.
    """
    for element in lt_objs:
        if isinstance(element, pdfminer.layout.LTTextBoxHorizontal):
            # Collapse the box onto a single output line.
            flattened = element.get_text().replace("\n", "")
            print(flattened)
            continue
        if isinstance(element, pdfminer.layout.LTFigure):
            # Figures may contain nested layout objects; walk them too.
            parse_obj(element._objs)
# Process every page: interpret it, fetch the aggregated layout from the
# device, and print the text found in it.
try:
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        parse_obj(layout._objs)
finally:
    # The input file was opened without a context manager; close it once all
    # pages have been handled, including when processing raises.
    fp.close()
提前致谢。