将pdf的目录转换为xml

时间:2018-11-08 12:07:36

标签: python-3.x pdf

我正在学习用python编写代码。首先,我进行了一个练习,将pdf的toc转换为XML。练习变得比我想象的要困难。 这是我的代码:

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pathlib import Path
from xml.etree.ElementTree import Element, SubElement
from xml.etree import ElementTree
from xml.dom import minidom

def prettify(elem):
    """Return a pretty-printed XML string for the Element.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent="  ")

pdfDoc = Path(r'<path of the pdf document>')
fp = open(pdfDoc, 'rb')
parser = PDFParser(fp)
doc = PDFDocument(parser)

outlines = doc.get_outlines()
docLevel = 0
previouslevel = 0
docLevelList = dict()
prevCh = ""
xDoc = Element("root")
for (level, title, dest, a, se) in outlines:
    if title == "Contents":
        pass
    else:
        currCh = title.split(" ",1)[0]

        if len(docLevelList) == 0 and  level == 1:
            docLevelList["ch_"+currCh] = "root"
            previouslevel = level
            prevCh = currCh
            child = SubElement(xDoc, sorted(docLevelList.keys())[-1])
            child.text = title.split(" ",1)[1]

        elif len(docLevelList) != 0 and level == 1 and level < previouslevel:
            docLevelList["ch_"+currCh] = "root"
            previouslevel = level
            prevCh = currCh
            child = SubElement(xDoc, sorted(docLevelList.keys())[-1])
            child.text = title.split(" ",1)[1]

        elif len(docLevelList) != 0 and level == 1 and int(currCh) == int (prevCh) + 1:
            docLevelList["ch_"+currCh] = "root"
            previouslevel = level
            prevCh = currCh
            child = SubElement(xDoc, sorted(docLevelList.keys())[-1])
            child.text = title.split(" ",1)[1]
            #print(docLevelList, previouslevel, prevCh)

        elif len(docLevelList) != 0 and level > previouslevel and prevCh in currCh:
            docLevelList["ch_"+currCh] = "ch_"+prevCh
            previouslevel = level
            prevCh = currCh
            child = SubElement(docLevelList.get(sorted(docLevelList.keys())[-1]), sorted(docLevelList.keys())[-1])
            child.text = title.split(" ",1)[1]

        elif len(docLevelList) != 0 and level == previouslevel and int(prevCh[-1]) + 1 == int(currCh[-1]):
            docLevelList["ch_"+currCh] = docLevelList.get("ch_"+prevCh)
            #docLevelList["ch_"+currCh] = list(docLevelList.keys())[list(docLevelList.values()).index("ch_"+prevCh)]
            previouslevel = level
            prevCh = currCh
            child = SubElement(docLevelList.get(sorted(docLevelList.keys())[-1]), sorted(docLevelList.keys())[-1])
            child.text = title.split(" ",1)[1]

print (prettify(xDoc))

SubElement调用中的docLevelList.get(sorted(docLevelList.keys())[-1])位正在创建TypeError。

从错误消息中,我了解到parent应该是一个元素对象,但是我的想法已经用完了。救命!!! 此外,任何其他改进建议也非常欢迎。 请原谅可悲的代码。

0 个答案:

没有答案