我的文件通常超过 8GB,所以我使用了 Liza Daly 的 fast_iter 解决方案。但是,运行了差不多 2 天它还没有完成。我用 cProfile 分析后发现,xpath 调用占用了将近一半的运行时间。我该如何加快处理速度?
以下是我的pdml文件示例:
<pdml>
<packet>
<proto name="geninfo">
<field name="timestamp" value="1"/>
</proto>
<proto name="frame">
<field name="b1"/>
</proto>
</packet>
</pdml>
这是我的解决方案:
from lxml import etree
# Stream <packet> elements one at a time instead of loading the whole
# (8GB+) document into memory; huge_tree lifts lxml's safety limits.
context = etree.iterparse(infilename,tag='packet', huge_tree=True)
# NOTE(review): `infilename` and `fast_iter` must already be defined when
# this line runs — as posted, fast_iter is defined further down, so this
# would raise NameError if executed top-to-bottom. TODO confirm ordering.
my_dict = fast_iter(context, process_packet)
def fast_iter(context, func, *args, **kwargs):
    """Consume an lxml ``iterparse`` context of <packet> elements.

    For each packet that has both a geninfo and a frame <proto> child,
    call ``func(elem, *args, **kwargs)``; ``func`` must return a
    ``(key, value)`` pair, and the values are grouped per key in the
    returned dict.  Processed elements (and their already-seen siblings)
    are freed immediately to keep memory flat on multi-GB inputs.

    Performance note: the original used ``elem.xpath(...)`` per packet,
    which cProfile showed dominating the run.  A plain scan over the
    element's direct children does the same check without building and
    evaluating an XPath expression for every packet.
    """
    my_dict = {}
    for event, elem in context:
        # Direct child scan replaces two per-packet XPath evaluations.
        geninfo = frame = None
        for child in elem:
            if child.tag == 'proto':
                name = child.get('name')
                if name == 'geninfo':
                    geninfo = child
                elif name == 'frame':
                    frame = child
        if geninfo is None or frame is None:
            # Incomplete packet: free it and every earlier sibling.
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
            continue
        item = func(elem, *args, **kwargs)
        elem.clear()
        # has_key() was removed in Python 3; setdefault does the
        # "create-list-if-missing then append" in one step.
        my_dict.setdefault(item[0], []).append(item[1])
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context
    return my_dict
def process_packet(elem):
    """Extract a record from one <packet> element.

    Returns a list whose first item is the packet's geninfo timestamp
    (as text); the remaining fields are elided in the original post.

    Uses a direct child scan instead of ``elem.xpath(...)`` — XPath
    evaluation per packet was the profiled bottleneck.  ``unicode`` from
    the original is Python-2-only; ``str`` is the Python-3 equivalent.
    """
    temp_list = None
    timestamp = None
    for proto in elem:
        if proto.tag == 'proto' and proto.get('name') == 'geninfo':
            for field in proto:
                if field.tag == 'field' and field.get('name') == 'timestamp':
                    timestamp = str(field.get('value'))
                    break
            break
    ...
    temp_list = [timestamp, ...]
    return temp_list