我在将XML解析为类似Pandas数据帧的字典列表时卡住了。
# -*- coding: utf-8 -*-
# pandas is only used to hold the parsed rows in this example.
import pandas as pd
# NOTE: the string literal below follows an import, so it is a plain
# expression statement with no effect, not the module docstring.
"""
It's very important for parsing!
"""
import sys
# Python 2 only: reloading ``sys`` makes ``setdefaultencoding`` callable
# again so the process-wide default str/unicode codec can be switched to
# UTF-8. ``reload`` is not a builtin on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')
# C-accelerated ElementTree (Python 2 era module name).
import xml.etree.cElementTree as ET
# Project-local table class; its definition is not part of this file.
from table import Table
def xml_to_pd(xml):
    """Parse a flat XML message into a DataFrame with one row per record.

    The element tree is walked in document order and folded into dicts:

    * an element that carries attributes starts a new record seeded with
      a copy of those attributes;
    * an element with non-blank text adds ``tag -> text`` to the current
      record;
    * an element with neither attributes nor text closes the current
      record.

    A record that is equal to one already collected is dropped, so
    repeated identical records appear only once in the result.

    Args:
        xml: XML document as a string.

    Returns:
        pandas.DataFrame built from the list of record dicts; empty when
        the document yields no records.

    Raises:
        Whatever ``ET.fromstring`` raises for malformed XML.
    """
    res = []
    doc_dict = {}
    for doc in ET.fromstring(xml).iter():
        text = doc.text
        # Whitespace-only text is just indentation between child elements
        # in pretty-printed XML; treat it as "no text".
        has_text = bool(text and text.strip())
        if doc.attrib or not has_text:
            # Record boundary: flush what has been collected so far.
            if doc_dict and doc_dict not in res:
                res.append(doc_dict)
            # Copy the attributes so the parsed element is never mutated
            # when text values are added to the record below.
            doc_dict = dict(doc.attrib)
        if has_text:
            doc_dict[doc.tag] = text
    # Flush the trailing record.
    if doc_dict and doc_dict not in res:
        res.append(doc_dict)
    return pd.DataFrame(res)
# Demo driver: parse each sample message and accumulate its rows in `table`.
table = pd.DataFrame()
allxml = [
    '<markets><market id="1">MMVB</market><market id="4">FORTS</market><market id="15">ETS</market></markets>',
    '<sec_info_upd><secid>1538</secid><seccode>SV16BL5</seccode><market>4</market><bgo_c>11908.97</bgo_c><bgo_nc>10307.27</bgo_nc><bgo_buy>4789.49</bgo_buy></sec_info_upd>',
    '<quotes><quote secid="3630"><board>FUT</board><seccode>SiZ5</seccode><price>68079</price><buy>-1</buy></quote><quote secid="3630"><board>FUT</board><seccode>SiZ5</seccode><price>68132</price><buy>2</buy></quote></quotes>',
]
for xml in allxml:
    res = xml_to_pd(xml)
    # Append the parsed frame exactly once. Looping ``for r in res`` walks
    # the frame's column labels, so the same rows would be concatenated
    # once per column and show up duplicated in the table.
    table = pd.concat([table, res])
    # Single-argument print calls behave the same on Python 2 and 3.
    print('\n\r')
    print(table)
我的想法是为每个XML消息构建一个表,但我得到了奇怪的混合结果,也不确定我的做法是否正确。
请不要关心Pandas,实际上我将使用另一个轻量级存储表对象接受像pandas DataFrame那样的dicts列表。
这对时间的要求也非常苛刻,因为XML数据流每10毫秒就会从股票市场推送一次。所以,问题是:我该如何正确而快速地做到这一点?
真的需要你的帮助,因为我完全陷入了这个xml地狱。 提前谢谢。
答案 0 :(得分:0)
这是我(丑陋)的解决方案:
def xml_to_pd(self, xml):
    """Parse a flat XML message into a list of record dicts.

    The element tree is walked in document order. Attributes and
    non-blank element text are merged into the current record; a new
    record is started as soon as an element's tag, or one of its
    attribute names, is already present in the current record.

    Side effect: the root element's tag is stored in ``self.xmltag``.

    Args:
        xml: XML document as a string.

    Returns:
        List of dicts, one per detected record. The trailing record is
        omitted when it is empty or equal to one already collected.

    Raises:
        Whatever ``ET.fromstring`` raises for malformed XML.
    """
    tree = ET.fromstring(xml)
    self.xmltag = tree.tag
    doc_dict = {}
    res = []
    for doc in tree.iter():
        # Seeing a tag again means the previous record is complete.
        if doc.tag in doc_dict:
            res.append(doc_dict)
            doc_dict = {}
        attrib = doc.attrib
        if attrib:
            # A repeated attribute name likewise marks a record boundary.
            if (doc_dict
                    and any(name in doc_dict for name in attrib)
                    and doc_dict not in res):
                res.append(doc_dict)
                doc_dict = {}
            doc_dict.update(attrib)
        text = doc.text
        # Whitespace-only text is indentation between child elements in
        # pretty-printed XML, not data; skip it.
        if text and text.strip():
            doc_dict[doc.tag] = text
    # Flush the trailing record.
    if doc_dict and doc_dict not in res:
        res.append(doc_dict)
    return res
Table() is my custom 'dataframe' class.
一年后的更新:
def xml_to_dict(self, xmltext):
    """Flatten the top level of an ``xmltodict`` parse into a single dict.

    For every top-level entry of the parsed document:

    * if the value is a dict, its entries are merged into the result
      with ``'@'`` characters stripped from both ends of each key
      (presumably xmltodict's attribute prefix -- confirm against the
      library version in use);
    * otherwise the ``key -> value`` pair is stored unchanged.

    Nested values are copied as-is; only one level is flattened.

    Args:
        xmltext: XML document as a string.

    Returns:
        dict with the flattened entries.
    """
    doc = xmltodict.parse(xmltext)
    _res = {}
    # ``items()`` instead of ``iteritems()`` so this runs on Python 2 and 3.
    for key, value in doc.items():
        if isinstance(value, dict):
            for subkey, subvalue in value.items():
                _res[subkey.strip('@')] = subvalue
        else:
            _res[key] = value
    return _res