我想使用lxml lib将xml转换为dict。
但是我有半结构化XML,例如:
<doc>This is<i>xmlobj</i> documentation.<b> SUPER</b> great</doc>
如果我这样做:
import xml.etree.cElementTree as ET
from collections import defaultdict
import time
my_file = "aaa.xml"
def etree_to_dict(t):
d = {t.tag: {} if t.attrib else None}
children = list(t)
if children:
dd = defaultdict(list)
for dc in map(etree_to_dict, children):
for k, v in dc.items():
dd[k].append(v)
d = {t.tag: {k: v[0] if len(v) == 1 else v
for k, v in dd.items()}}
if t.attrib:
d[t.tag].update(('@' + k, v)
for k, v in t.attrib.items())
if t.text:
text = t.text.strip()
if children or t.attrib:
if text:
d[t.tag]['#text'] = text
else:
d[t.tag] = text
return d
from lxml import etree as ET
count = 0
p=True
tree = ET.parse(my_file)
root = tree.getroot()
#iter = root.iter('PubmedArticle')
iter = root.iter('example')
for entry in iter:
count+= 1
if p:
p=False
r = etree_to_dict(entry)
print(r)
print(count)
我得到:
{'example': {'doc': {'i': 'xmlobj', 'b': 'SUPER', '#text': 'This is'}, 'a': 'Oui'}}
“伟大”缺失
我尝试实现自己的treebuilder:
from lxml import etree as ET
parser = ET.XMLParser( target = myTreeBuilder())
tree = ET.parse(my_file,parser)
r = etree_to_dict(tree)
但是我不知道我应该如何实现它。
默认的TreeBuilder在那里:https://github.com/python/cpython/blob/master/Lib/xml/etree/ElementTree.py