如何在 Python 代码中使用 pdfminer 批量转换 PDF 文件

时间:2016-03-25 08:10:26

标签: python-2.7

我想用 pdfminer 转换一个文件夹中的多个 pdf 文件,并把转换结果放到另一个文件夹中。起初我已经能成功转换单个文件,但是在转换多个文件时,第二个转换出来的 txt 文件里会包含第一个 pdf 文件的内容。我写了以下代码。

from subprocess import Popen, PIPE

#http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os

import os


def convert_pdf_to_txt(path):
    """Extract text from every entry of folder *path* with pdfminer.

    For each entry the extracted text is written twice, to ``<name>.txt``
    and ``<name>.html``, inside a ``doc3`` folder located under the parent
    of the entry's own folder.  The function has no ``return`` statement,
    so it returns ``None``.

    NOTE(review): one ``StringIO`` buffer and one ``TextConverter`` are
    created before the per-file loop and shared by all files, so the text
    read back with ``getvalue()`` also contains everything extracted from
    the files processed earlier.
    """

    listing=os.listdir(path)

    # Build the list of full paths for every directory entry.
    docid=[]
    for infile in listing:
        infile=os.path.join(path,infile)

        docid.append(infile)
        # `zz` is only bound here, so an empty folder leaves it undefined
        # and the `for kk in zz` loop below raises NameError.
        zz=docid
    # Shared pdfminer pipeline: the converter writes extracted text into
    # `retstr`, and the interpreter feeds pages to the converter.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)    
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = False
    pagenos=set()

    for kk in zz:
        ass=kk
        # Python 2 `file()` builtin; the handle is not closed inside the loop.
        ap=file(ass,"rb")
        for page in PDFPage.get_pages(ap, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
            interpreter.process_page(page)
            # Re-read the whole buffer after every page.  `retstr` is never
            # reset, so this value grows across files.  The name `str`
            # shadows the builtin within this function.
            str = retstr.getvalue()           
        # Split "<parent>/<folder>/<name>.<ext>" into its components.
        filename, file_extension=os.path.splitext(kk)
        splitted,files=os.path.split(filename)
        splittedd,pathd=os.path.split(splitted)
        yy=splittedd+'//'+'doc3'+'//'+files+'.'+'txt'
        # Rebinding `zz` does not disturb the running loop: the `for`
        # statement keeps iterating over the list object it started with.
        zz=splittedd+'//'+'doc3'+'//'+files+'.'+'html'
        txtfileo=open(yy,'w')
        txtfileo.write(str)
        txtfileo.close()
        # The ".html" file receives the same plain text as the ".txt" file.
        txtfileo1=open(zz,'w')
        txtfileo1.write(str)
        txtfileo1.close()
    retstr.close()            
    # Only the handle opened in the last iteration is closed here.
    ap.close()
    device.close()

# Run the conversion (Python 2 print statement).  The function above has no
# return statement, so this prints "None".
# NOTE(review): the spaces around the backslash in the path look like a
# formatting artifact -- confirm the intended folder name.
print convert_pdf_to_txt('amharicir \ docname1')

如果有人能帮忙解决这个问题,我将不胜感激。

1 个答案:

答案 0 :(得分:0)

from subprocess import Popen, PIPE
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os

import os


def _extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF stored at *pdf_path*."""
    # A fresh buffer and converter per document keeps the text of one file
    # from leaking into the output of the next one.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # `with` closes the PDF handle even when pdfminer raises.
        with open(pdf_path, 'rb') as pdf_file:
            # Empty page set and maxpages=0: process all pages.
            pages = PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


def convert_pdf_to_txt(path):
    """Convert every file in folder *path* to text with pdfminer.

    For each regular file ``<parent>/<folder>/<name>.<ext>`` the extracted
    text is written to ``<parent>/doc3/<name>.txt`` and, unchanged, to
    ``<parent>/doc3/<name>.html``.  The ``doc3`` folder is created when it
    does not exist yet.  Sub-directories of *path* are skipped.

    Args:
        path: folder containing the PDF files to convert.

    Returns:
        The text extracted from the last file processed (in
        ``os.listdir`` order), or ``''`` when no file was converted.

    Raises:
        OSError: if *path* cannot be listed or an output cannot be written.
        Any exception pdfminer raises for an unreadable document propagates.
    """
    text = ''
    for name in os.listdir(path):
        pdf_path = os.path.join(path, name)
        if not os.path.isfile(pdf_path):
            continue

        text = _extract_pdf_text(pdf_path)

        # Outputs go to a "doc3" folder that is a sibling of the input
        # folder; os.path.join keeps this relative when the input folder
        # has no parent component.
        parent_dir = os.path.dirname(os.path.dirname(pdf_path))
        out_dir = os.path.join(parent_dir, 'doc3')
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        base_name = os.path.splitext(name)[0]
        for extension in ('.txt', '.html'):
            out_path = os.path.join(out_dir, base_name + extension)
            with open(out_path, 'w') as out_file:
                out_file.write(text)
    return text

# Convert the sample folder and print the text of the last converted file.
# The parenthesised form behaves the same under Python 2.7 and is also
# valid Python 3 syntax.
print(convert_pdf_to_txt('amharicir\\docname1'))