使用 pdfminer 将 PDF 按页拆分

时间:2016-07-03 17:34:52

标签: python pdfminer

我试图逐页提取pdf并将结果存储在字典中,如下所示:

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import re

def convert_pdf_to_txt(path):
    """Extract text from the PDF at ``path`` into a dict keyed by page number.

    Keys start at 1. Each value is the content of the converter's output
    buffer at the moment that page finished processing, with runs of
    spaces collapsed to a single space.

    NOTE: ``retstr`` is one buffer shared by every page and it is never
    cleared inside the loop, so the value stored for page ``i`` is
    everything written to the buffer so far, not only the text of page ``i``.
    """
    ps=dict()
    rsrcmgr = PDFResourceManager()
    # Single output buffer that the TextConverter writes into for all pages.
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # ``file`` is the Python 2 built-in; fp is never closed in this function.
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    # Passed straight through to PDFPage.get_pages; presumably 0 and an empty
    # set mean "no page limit" / "all pages" -- confirm against pdfminer docs.
    maxpages = 0
    caching = True
    pagenos=set()
    i=1
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)
        # getvalue() returns the whole buffer, which still holds earlier pages.
        text = retstr.getvalue()
        ps[i]=re.sub(' +',' ',text)
        i+=1
    return ps

# Show the entry stored under key 3 of the per-page dict for 'Aak.pdf'.
print(convert_pdf_to_txt('Aak.pdf')[3])

但无论我访问哪一页,得到的结果都会同时包含它之前所有页面的文本,而不只是该页的内容。请问应该如何解决这个问题?

1 个答案:

答案 0 :(得分:0)

这应该有效:每处理完一页就重新创建输出缓冲区和解释器,这样每一页的文本就不会与之前页面的文本累积在一起。

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO

import os

def set_interpreter():
    """Build a fresh pdfminer text-extraction pipeline.

    Returns:
        dict: three entries --
            'retstr': the StringIO buffer the converter writes into,
            'device': the TextConverter bound to that buffer,
            'interpreter': the PDFPageInterpreter driving that device.
    """
    resource_manager = PDFResourceManager()
    output_buffer = StringIO()
    text_device = TextConverter(
        resource_manager,
        output_buffer,
        codec='utf-8',
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(resource_manager, text_device)
    return dict(
        retstr=output_buffer,
        device=text_device,
        interpreter=page_interpreter,
    )

def convert_pdf_to_txt(path):
    """Extract the text of each page of a PDF separately.

    Every page is processed with its own pipeline from ``set_interpreter()``,
    so the output buffer of one page never contains text from another page.
    As a side effect, the text of each page is written to
    ``pagetext_<n>.txt`` in the current working directory, where ``<n>`` is
    the 0-based page index.

    Args:
        path: Path of the PDF file to read.

    Returns:
        list: One string per page, in page order; index ``n`` holds the same
        text that was written to ``pagetext_<n>.txt``.
    """
    password = ""
    # Passed straight through to PDFPage.get_pages; presumably 0 and an empty
    # set mean "no page limit" / "all pages" -- confirm against pdfminer docs.
    maxpages = 0
    caching = True
    pagenos = set()

    page_texts = []
    with open(path, 'rb') as fp:
        pages = PDFPage.get_pages(
            fp, pagenos, maxpages=maxpages, password=password,
            caching=caching, check_extractable=True)
        for page_counter, page in enumerate(pages):
            # A new buffer/device/interpreter per page keeps pages separate.
            si = set_interpreter()
            try:
                si['interpreter'].process_page(page)
                page_text = si['retstr'].getvalue()
            finally:
                # Release this page's device and buffer even if parsing fails.
                si['device'].close()
                si['retstr'].close()

            with open('pagetext_%d.txt' % page_counter, 'w+') as fpp:
                fpp.write(page_text)
            page_texts.append(page_text)

    # The original returned the undefined name ``text`` (NameError).
    return page_texts

# Build the path from the resolved directory of 'filename.pdf', then convert it.
pdf_directory = os.path.dirname(os.path.realpath('filename.pdf'))
print(convert_pdf_to_txt(pdf_directory + "/filename.pdf"))