我正在学习用 Python 编写代码。作为第一个练习,我尝试将 PDF 的目录(TOC)转换为 XML,结果这个练习比我想象的要困难。这是我的代码:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pathlib import Path
from xml.etree.ElementTree import Element, SubElement
from xml.etree import ElementTree
from xml.dom import minidom
def prettify(elem: Element) -> str:
    """Return a pretty-printed XML string for the Element.

    The element is serialised to UTF-8 bytes, re-parsed with minidom and
    rendered with one space of indentation per nesting level.

    Args:
        elem: the root of the ElementTree (sub)tree to render.

    Returns:
        The indented XML document as text, including the XML declaration
        emitted by ``toprettyxml``.
    """
    # ElementTree has no pretty-printer of its own here, so round-trip the
    # serialised bytes through minidom, which does.
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent=" ")
pdfDoc = Path(r'<path of the pdf document>')
fp = open(pdfDoc, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)
outlines = doc.get_outlines()
docLevel = 0
previouslevel = 0
docLevelList = dict()
prevCh = ""
xDoc = Element("root")
for (level, title, dest, a, se) in outlines:
if title == "Contents":
pass
else:
currCh = title.split(" ",1)[0]
if len(docLevelList) == 0 and level == 1:
docLevelList["ch_"+currCh] = "root"
previouslevel = level
prevCh = currCh
child = SubElement(xDoc, sorted(docLevelList.keys())[-1])
child.text = title.split(" ",1)[1]
elif len(docLevelList) != 0 and level == 1 and level < previouslevel:
docLevelList["ch_"+currCh] = "root"
previouslevel = level
prevCh = currCh
child = SubElement(xDoc, sorted(docLevelList.keys())[-1])
child.text = title.split(" ",1)[1]
elif len(docLevelList) != 0 and level == 1 and int(currCh) == int (prevCh) + 1:
docLevelList["ch_"+currCh] = "root"
previouslevel = level
prevCh = currCh
child = SubElement(xDoc, sorted(docLevelList.keys())[-1])
child.text = title.split(" ",1)[1]
#print(docLevelList, previouslevel, prevCh)
elif len(docLevelList) != 0 and level > previouslevel and prevCh in currCh:
docLevelList["ch_"+currCh] = "ch_"+prevCh
previouslevel = level
prevCh = currCh
child = SubElement(docLevelList.get(sorted(docLevelList.keys())[-1]), sorted(docLevelList.keys())[-1])
child.text = title.split(" ",1)[1]
elif len(docLevelList) != 0 and level == previouslevel and int(prevCh[-1]) + 1 == int(currCh[-1]):
docLevelList["ch_"+currCh] = docLevelList.get("ch_"+prevCh)
#docLevelList["ch_"+currCh] = list(docLevelList.keys())[list(docLevelList.values()).index("ch_"+prevCh)]
previouslevel = level
prevCh = currCh
child = SubElement(docLevelList.get(sorted(docLevelList.keys())[-1]), sorted(docLevelList.keys())[-1])
child.text = title.split(" ",1)[1]
print (prettify(xDoc))
SubElement 调用中的 docLevelList.get(sorted(docLevelList.keys())[-1]) 这一部分会引发 TypeError。
从错误消息来看,我明白 parent 参数应该是一个 Element 对象,但我已经想不出解决办法了。请帮帮我!此外,也非常欢迎任何其他改进建议。请原谅这段糟糕的代码。