使用python

时间:2019-04-26 10:52:19

标签: python

我正在尝试把所有 PDF 文件转换为文本,但转换后将内容保存到文本文件时,原文中的空格丢失了。例如,得到的输出如下:

AMITY UNIVERSITY 420乌迪格·维哈尔(Udyog Vihar),第四阶段,古拉姆(Gurugram)122016,哈里亚纳邦,印度电话:+ 91 124391

我希望在“印度”和后面的电话号码之间保留一个空格。下面是我使用的代码:

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
import pdfminer
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
import os
import tempfile, subprocess


# Convert PDF files to text.


# NOTE(review): this value is reassigned further down the file before
# convertMultiple() is called, so this first assignment appears to be unused.
pdfDir = "C:\\user\\IRS\\Documents\\6to11thfeb\\spi\\"

def convert(fname, pages=None):
    """Extract the text content of a PDF file using pdfminer.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers forwarded to
            ``PDFPage.get_pages``. When falsy, an empty set is passed,
            which pdfminer is expected to treat as "no page filter"
            (i.e. every page is processed).

    Returns:
        The extracted text of the processed pages as a single string.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    # The context manager guarantees the in-memory buffer is really closed;
    # the previous ``output.close`` (no parentheses) never ran.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            # Close the PDF even if a page fails to parse.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter has been closed, as the
        # original code did, and before the buffer itself is closed.
        return output.getvalue()

def convertMultiple(pdfDir, txtDir):
    """Convert every ``.pdf`` file in *pdfDir* into a text file in *txtDir*.

    Each ``name.pdf`` is passed to ``convert`` and the result is written,
    UTF-8 encoded and followed by a single tab character, to ``name.txt``.
    Only the final ``.pdf`` suffix is stripped, so ``a.v1.pdf`` becomes
    ``a.v1.txt`` instead of colliding with other files on ``a.txt``.

    Args:
        pdfDir: Directory that is scanned (non-recursively) for PDF files.
        txtDir: Directory that receives the generated ``.txt`` files.
            A trailing path separator is accepted but not required.
    """
    for pdf in os.listdir(pdfDir):
        # splitext only matches a real extension, so a file literally named
        # "pdf" (no dot) is no longer mistaken for a PDF.
        stem, extension = os.path.splitext(pdf)
        if extension != ".pdf":
            continue

        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, stem + ".txt")
        # The with-block flushes and closes the file; previously the handle
        # was never closed, so output could be left buffered.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text + '\t')

# The source PDFs and the generated .txt files share the same directory.
txtDir = "C:\\Users\\IRIJFE\\Documents\\6to11thfeb\\spi\\"
pdfDir = txtDir

# Run the batch conversion over that directory.
convertMultiple(pdfDir, txtDir)

0 个答案:

没有答案