我想使用 pdfminer 把一个文件夹中的多个 PDF 文件转换为文本,并把结果保存到另一个文件夹中。我已经能够成功转换单个文件,但是在转换多个文件时,第二个生成的 txt 文件里也会包含第一个 PDF 文件的内容。我的代码如下。
from subprocess import Popen, PIPE
#http://stackoverflow.com/questions/5725278/python-help-using-pdfminer-as-a-library
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
import os
# Convert the entries of directory `path` with pdfminer and write the extracted
# text to .txt and .html files in a 'doc3' folder.
# NOTE(review): the indentation of this snippet was lost when it was pasted, so
# the exact nesting of the statements below cannot be read from the text alone.
def convert_pdf_to_txt(path):
listing=os.listdir(path)
docid=[]
# Build the joined path of every directory entry (no filtering by extension).
for infile in listing:
infile=os.path.join(path,infile)
docid.append(infile)
zz=docid
# The resource manager, output buffer, converter and interpreter all appear
# before the `for kk in zz` loop. NOTE(review): presumably they are created only
# once, so every PDF is written into the same `retstr` buffer -- which would
# explain why a later output file also contains the text of earlier PDFs.
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = False
pagenos=set()
for kk in zz:
ass=kk
# `file` is the Python 2 builtin for opening a file.
ap=file(ass,"rb")
for page in PDFPage.get_pages(ap, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
# getvalue() returns everything written to `retstr` so far; the buffer is never
# reset in this snippet. The local name `str` shadows the builtin.
str = retstr.getvalue()
# filename: input path without its extension; splitted: the directory holding
# the PDF; splittedd: the parent of that directory.
filename, file_extension=os.path.splitext(kk)
splitted,files=os.path.split(filename)
splittedd,pathd=os.path.split(splitted)
# Output paths are assembled by string concatenation as <parent>//doc3//<name>.
yy=splittedd+'//'+'doc3'+'//'+files+'.'+'txt'
# `zz` is rebound here to the .html output path, reusing the name of the list
# that the loop above iterates over.
zz=splittedd+'//'+'doc3'+'//'+files+'.'+'html'
# The same extracted text is written to both the .txt and the .html file.
txtfileo=open(yy,'w')
txtfileo.write(str)
txtfileo.close()
txtfileo1=open(zz,'w')
txtfileo1.write(str)
txtfileo1.close()
retstr.close()
ap.close()
device.close()
# No return statement: the function does not hand the text back to the caller.
# Python 2 print statement. The function above has no return statement, so this
# prints None. NOTE(review): the literal contains spaces around the backslash,
# which looks like a paste artifact -- confirm the real directory name.
print convert_pdf_to_txt('amharicir \ docname1')
如果有人能帮忙解决这个问题,我将不胜感激。
答案 0 :(得分:0)
from subprocess import Popen, PIPE
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
import os
def _extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF file at *pdf_path*.

    A fresh resource manager, output buffer and converter are created on each
    call, so text extracted from one document can never leak into the next.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(pdf_path, 'rb') as pdf_file:
            # Empty page set and maxpages=0 are the original settings
            # (no restriction on which pages are read).
            pages = PDFPage.get_pages(
                pdf_file,
                set(),
                maxpages=0,
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        # Read the buffer before the ``finally`` clause closes it.
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()


def convert_pdf_to_txt(path):
    """Convert every PDF in directory *path* to text with pdfminer.

    For each ``<name>.pdf`` found directly inside *path*, the extracted text
    is written to ``<name>.txt`` and, unchanged, to ``<name>.html`` in a
    ``doc3`` directory located next to *path* (i.e. in *path*'s parent).
    ``doc3`` is created when it does not exist yet.

    Entries that are not regular files or whose extension is not ``.pdf``
    (compared case-insensitively) are skipped instead of being handed to the
    PDF parser. Files are processed in sorted name order.

    Args:
        path: Directory containing the PDF files to convert.

    Returns:
        The extracted text of the last PDF that was converted, or an empty
        string when *path* contains no PDF files.

    Raises:
        OSError: If *path* cannot be listed or an output file cannot be
            written.
    """
    # Joining with '' normalises an optional trailing separator, so the parent
    # directory is found the same way for 'a/b' and 'a/b/'.  Using
    # os.path.join for the output keeps a parent-less *path* relative instead
    # of producing '//doc3//...' at the filesystem root.
    parent_dir = os.path.dirname(os.path.dirname(os.path.join(path, '')))
    output_dir = os.path.join(parent_dir, 'doc3')

    text = ''
    for name in sorted(os.listdir(path)):
        pdf_path = os.path.join(path, name)
        stem, extension = os.path.splitext(name)
        if extension.lower() != '.pdf' or not os.path.isfile(pdf_path):
            continue

        text = _extract_pdf_text(pdf_path)

        # Create the output directory lazily, only once there is something
        # to write.
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        # The same plain text is stored under both extensions, as before.
        for suffix in ('.txt', '.html'):
            with open(os.path.join(output_dir, stem + suffix), 'w') as out_file:
                out_file.write(text)
    return text
# Run the conversion on the amharicir\docname1 directory and print the value
# the function returns (Python 2 print statement; '\\' is an escaped backslash).
print convert_pdf_to_txt('amharicir\\docname1')