使用pdfminer从文件夹转换多个PDF文件,根据其坐标提取文本

时间:2018-05-25 01:00:19

标签: python python-2.7 coordinates nested-lists pdfminer

我想使用pdfminer转换一个文件夹中的多个PDF文件,根据坐标提取其中的文本,并将结果存储到列表或字典中。起初我已经能够转换单个文件,并根据坐标提取其中的文本。

不得不说我并不擅长Python,但我正在努力学习。如果有人能告诉我如何修复以下代码,我将不胜感激:

import os
import glob
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
from cStringIO import StringIO


# --- Input discovery -------------------------------------------------------
# Every *.pdf that sits directly in the current working directory.
directory = os.path.abspath('./')
pdfFiles = glob.glob(os.path.join(directory, '*.pdf'))

# --- Shared pdfminer pipeline ----------------------------------------------
# One resource manager / aggregator device / interpreter is built here and
# reused for every page of every file processed below.
laParams = LAParams()
resourceManager = PDFResourceManager()
device = PDFPageAggregator(resourceManager, laparams=laParams)
interpreter = PDFPageInterpreter(resourceManager, device)

# --- Names kept from the single-file version -------------------------------
# Nothing in the visible code writes to returnString or reads codec;
# returnString is only closed at the end of the script.
returnString = StringIO()
codec = 'utf-8'

# --- Arguments handed to PDFPage.get_pages ---------------------------------
# NOTE(review): presumably an empty page set and maxPages == 0 mean "all
# pages, no limit" -- confirm against pdfminer's get_pages documentation.
pageNums = set()
maxPages = 0
caching = True

def parse_obj(lt_objs, found=None):
    """Print and collect the horizontal text boxes of a layout tree.

    Each LTTextBoxHorizontal is printed as "x, y, text" (newlines shown as
    underscores) and appended to ``found`` as ``[[x0, y0], text]``, where
    x0/y0 are the first two values of the box's ``bbox``.  LTFigure objects
    are searched recursively; every other layout object is ignored.

    Args:
        lt_objs: Iterable of pdfminer layout objects (a page layout, a
            figure, or a plain list of objects).
        found: Optional list to extend; a new list is created when omitted.

    Returns:
        The list holding the collected ``[[x0, y0], text]`` entries.
    """
    if found is None:
        found = []
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            print("%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1],
                                    obj.get_text().replace('\n', '_')))
            found.append([[obj.bbox[0], obj.bbox[1]], obj.get_text()])
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # Figures are containers: iterate them directly rather than
            # reaching into the private _objs attribute.
            parse_obj(obj, found)
    return found


# Maps each PDF's file name to the list of [[x0, y0], text] entries found in
# it.  All files come from one directory, so the base name is a unique key.
pdf_dict = {}

for one_pdf in pdfFiles:
    print("Processing file: " + str(one_pdf))
    name = os.path.basename(one_pdf)

    # A fresh list per file keeps one PDF's text from leaking into another's.
    lst = []

    # The with-block closes the file even if pdfminer raises part-way through.
    with open(one_pdf, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pageNums, maxpages=maxPages,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout, lst)

    pdf_dict[name] = lst

device.close()
returnString.close()

这段代码会输出给定文件夹中每个PDF的文件名,随后是该PDF的文本布局。我卡住的地方在于:无法把结果成功存储到嵌套列表或嵌套字典(字典的字典)中。

1 个答案:

答案 0 :(得分:1)

我想我已经弄清楚了如何以PDF文件名为键,把其内容以及内容的坐标作为值存储到字典(pdf_dict)中。

# One-element sets holding each processed file name.  Nothing in this snippet
# reads it back; it is kept so existing follow-up code keeps working.
dicts_from_file = []
# Maps each PDF's file name to its list of [[x0, y0], text] entries.
pdf_dict = {}


def parse_obj(lt_objs, found=None):
    """Collect the horizontal text boxes of a layout tree.

    Each LTTextBoxHorizontal is appended to ``found`` as ``[[x0, y0], text]``
    with newlines removed from the text, where x0/y0 are the first two values
    of the box's ``bbox``.  LTFigure objects are searched recursively, and the
    scan continues with the remaining siblings afterwards.

    Args:
        lt_objs: Iterable of pdfminer layout objects (a page layout, a
            figure, or a plain list of objects).
        found: Optional list to extend; a new list is created when omitted.

    Returns:
        The list holding the collected ``[[x0, y0], text]`` entries.
    """
    if found is None:
        found = []
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            found.append([[obj.bbox[0], obj.bbox[1]],
                          obj.get_text().replace('\n', '')])
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # Figures are containers: iterate them directly rather than
            # reaching into the private _objs attribute.
            parse_obj(obj, found)
    return found


for one_pdf in pdfFiles:
    name = str(one_pdf)
    head, tail = os.path.split(name)
    dicts_from_file.append({tail})

    # A fresh list per file: sharing one list would make every key in
    # pdf_dict point at the combined text of all PDFs.
    lst = []

    # The with-block closes the file even if pdfminer raises part-way through.
    # No password is passed: none is defined by the setup code, so pdfminer's
    # default is used.
    with open(one_pdf, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pageNums, maxpages=maxPages,
                                      caching=caching, check_extractable=True):
            interpreter.process_page(page)
            layout = device.get_result()
            parse_obj(layout, lst)

    # Stored once per file, so a PDF without text boxes still gets an
    # (empty) entry.
    pdf_dict[tail] = lst

device.close()
returnString.close()