我正在尝试把所有PDF文件转换为文本,但转换后把内容保存到文本文件时,原文中的部分空格丢失了。例如,得到的输出如下:
AMITY UNIVERSITY 420乌迪格·维哈尔(Udyog Vihar),第四阶段,古拉姆(Gurugram)122016,哈里亚纳邦,印度电话:+ 91 124391
我希望在“印度”和后面的电话号码(“电话:…”)之间有一个空格。
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
import pdfminer
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
import os
import tempfile, subprocess
# Converting PDF to text.
# NOTE(review): this module-level value is reassigned further down, right
# before convertMultiple is called, so this first assignment is never used.
pdfDir = "C:\\user\\IRS\\Documents\\6to11thfeb\\spi\\"
def convert(fname, pages=None, *, laparams=None):
    """Extract the text content of a PDF file using pdfminer.

    Args:
        fname: Path of the PDF file to read (opened in binary mode).
        pages: Optional iterable of page numbers to extract. When it is
            None or empty, an empty set is handed to ``PDFPage.get_pages``
            (presumably meaning "every page" -- confirm against the
            pdfminer documentation).
        laparams: Optional ``LAParams`` instance used for layout analysis.
            Defaults to ``LAParams()``, which matches the previous
            behaviour. NOTE(review): layout parameters such as the word
            and character margins appear to influence where spaces are
            emitted between text fragments -- verify in the pdfminer docs
            before tuning.

    Returns:
        The extracted text as a single string.
    """
    pagenums = set(pages) if pages else set()
    if laparams is None:
        laparams = LAParams()

    manager = PDFResourceManager()
    # Context managers guarantee the buffer and the input file are closed
    # even if pdfminer raises while parsing a malformed PDF.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()
def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a ``.txt`` file in ``txtDir``.

    Each file whose extension is ``.pdf`` (case-insensitive) is passed to
    ``convert`` and the resulting text, followed by a tab character, is
    written as UTF-8 to ``<txtDir>/<original name without .pdf>.txt``.

    Args:
        pdfDir: Directory that is scanned (non-recursively) for PDF files.
        txtDir: Directory that receives the generated text files.
    """
    for pdf in os.listdir(pdfDir):
        # splitext only strips the final extension, so "a.b.pdf" becomes
        # "a.b.txt" instead of being truncated to "a.txt".
        stem, extension = os.path.splitext(pdf)
        if extension.lower() != ".pdf":
            continue

        # os.path.join works whether or not the directory ends with a
        # path separator.
        text = convert(os.path.join(pdfDir, pdf))
        textFilename = os.path.join(txtDir, stem + ".txt")

        # The with-block closes (and therefore flushes) the file; without
        # it the written text could be lost or truncated.
        with open(textFilename, "w", encoding="utf-8") as textFile:
            textFile.write(text + '\t')
# Input PDFs and generated text files live in the same folder, so both
# names are bound to the one path before running the batch conversion.
pdfDir = txtDir = "C:\\Users\\IRIJFE\\Documents\\6to11thfeb\\spi\\"
convertMultiple(pdfDir=pdfDir, txtDir=txtDir)