我使用下面的代码将PDF数据转换为XML数据,并将转换结果写入XML文件。这段代码广为人知(它使用了PDFMiner模块),在将PDF转换为文本和HTML时效果很好,但在执行PDF到XML的转换时我遇到了问题。我是一个新手,如果能得到一些帮助就太好了 :)
代码如下:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter, XMLConverter, HTMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Convert a PDF file to text, HTML or XML and return it as a string.

    Args:
        path: Path of the PDF file to read.
        format: Output format; one of ``'text'``, ``'html'`` or ``'xml'``.
        codec: Encoding the converter writes with; the same encoding is
            used to decode the result before it is returned.
        password: Password handed to ``PDFPage.get_pages``.

    Returns:
        The converted document as a ``str``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad format fails before any resource is created.
    if format not in ('text', 'html', 'xml'):
        raise ValueError('provide format, either text, html or xml!')
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter

    rsrcmgr = PDFResourceManager()
    with BytesIO() as retstr:
        device = converter_cls(
            rsrcmgr, retstr, codec=codec, laparams=LAParams()
        )
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # An empty page-number set and maxpages=0 select every page.
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password=password,
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # The HTML and XML converters emit their closing footer in
            # close(), so the device must be closed *before* the buffer is
            # read; otherwise the document is truncated (e.g. no </pages>).
            device.close()
        # Decode with the codec the converter wrote with, not the default.
        return retstr.getvalue().decode(codec)
path_pdf = ...
path_xml = ...
# The goal is an XML file, so request XML explicitly; the default format of
# convert_pdf is plain text.
text_output = convert_pdf(path_pdf, format='xml')
# Opening with "w" truncates any previous content, and the context manager
# guarantees the data is flushed and the handle is closed.
with open(path_xml, "w", encoding="utf-8") as xml_file:
    xml_file.write(text_output)
这是我得到的错误:
提前谢谢!
答案 0 :(得分:0)
如果你使用的是 Python 2.7,下面的代码可以正常工作:
from io import BytesIO
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
def get_xml_py2(file_path):
    """Yield the XML rendering of each page of a PDF, one page at a time.

    Every page is processed by its own ``XMLConverter`` writing into a fresh
    in-memory buffer, so each yielded value is that converter's complete
    output for a single page.

    Args:
        file_path: Path of the PDF file to read.

    Yields:
        The bytes written by the converter for one page.
    """
    # Imported locally: the surrounding snippet's import list does not
    # include XMLConverter, which would otherwise raise NameError.
    from pdfminer.converter import XMLConverter

    # Read the whole file up front so no file handle stays open while the
    # consumer iterates over the generator.
    with open(file_path, 'rb') as pdf_file:
        pdf_bytes = pdf_file.read()

    laparams = LAParams(all_texts=True)
    rsrcmgr = PDFResourceManager()
    with BytesIO(pdf_bytes) as in_fp:
        for page in PDFPage.get_pages(in_fp):
            with BytesIO() as outfp:
                device = XMLConverter(rsrcmgr, outfp, laparams=laparams)
                try:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                finally:
                    # close() writes the XML footer; it has to run before
                    # the buffer is read or the closing tag is missing.
                    device.close()
                page_xml = outfp.getvalue()
            yield page_xml