我原先使用 PDFMiner 从 PDF 中提取数据，后来改用 pdftotext，得到了如下格式的输出：
<flow>
<block xMin="257.682878" yMin="327.645732" xMax="267.436664" yMax="353.154018">
<line xMin="257.682878" yMin="327.645732" xMax="267.436664" yMax="353.154018">
<word xMin="257.682878" yMin="344.226468" xMax="267.429662" yMax="353.154018">MR</word>
<word xMin="257.682878" yMin="327.645732" xMax="267.436664" yMax="342.630012">(WDI)</word>
</line>
</block>
</flow>
如何在 PDFMiner 中得到同样格式的输出？我目前使用的方法如下：
def parse_obj(lt_objs):
    """Record the text and bbox origin of every horizontal text box.

    Walks ``lt_objs``. For each ``LTTextBoxHorizontal`` the box's lines are
    joined into a single string, ``(obj.bbox[0], obj.bbox[1], text)`` is
    appended to the module-level ``data`` list, and the same triple is
    printed. ``LTFigure`` objects are searched recursively; every other
    layout object is ignored.

    A box with three or more lines is emitted with its characters in
    reverse order (presumably to restore rotated text that arrives one
    fragment per line, last fragment first -- TODO confirm with the author).

    NOTE(review): the snippet this was recovered from had lost its
    indentation. This version assumes the line separators were stripped
    for every text box, not only for the reversed ones -- confirm.

    Args:
        lt_objs: Iterable of pdfminer layout objects.
    """
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            # Drop whatever follows the final newline; the text normally
            # ends with "\n", so this only removes the empty tail.
            body = obj.get_text().rsplit('\n', 1)[0]
            # Work on real newlines rather than a "_" sentinel so that
            # underscores belonging to the text itself are preserved.
            txt = body.replace('\n', '')
            if body.count('\n') > 1:
                txt = txt[::-1]
            x0, y0 = obj.bbox[0], obj.bbox[1]
            data.append((x0, y0, txt))
            print("%6d, %6d, %s" % (x0, y0, txt))
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # Layout containers are iterable over their children, so the
            # private ``_objs`` attribute is not needed.
            parse_obj(obj)
# Lay out and parse only the page at zero-based index 16.
pages = PDFPage.create_pages(document)
for page_number, page in enumerate(pages):
    if page_number != 16:
        continue
    interpreter.process_page(page)
    layout = device.get_result()
    # Extract text from the page layout. NOTE(review): assumes the device
    # returns a pdfminer layout container, which iterates over its child
    # objects -- confirm against how ``device`` is constructed.
    parse_obj(layout)
    # Nothing after the target page is used, so stop scanning the document.
    break