我需要列出<product>
项中的所有元素,因为<product>
的元素是可变的。
XML文件:
<catalog>
<product>
<element1>text 1</element1>
<element2>text 2</element2>
<element..>text ..</element..>
</product>
</catalog>
Python解析器: 我使用fast_iter因为我的xml文件很大......
import lxml.etree as etree
import configs.application as configs
# Path of the XML file to parse, built from a value exposed by the project's
# configs package.
# NOTE(review): the import above already binds `configs` to
# `configs.application`, so `configs.application.tmp` looks like one
# `.application` too many -- confirm against the configs package.
myfile = configs.application.tmp + '/xml_hug_file.xml'
def fast_iter(context, func, *args, **kwargs):
    """Call ``func`` on every element from ``context`` and discard it afterwards.

    Args:
        context: Iterable of ``(event, elem)`` pairs, such as the object
            returned by ``lxml.etree.iterparse``. Each ``elem`` must provide
            ``clear()``, ``getprevious()`` and ``getparent()``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for _event, elem in context:
        func(elem, *args, **kwargs)
        # The element has been handled, so its content is no longer needed.
        elem.clear()
        # Remove the parent's leading children until this element has no
        # preceding sibling left, so processed siblings do not pile up.
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    # Drop the local reference to the iterator once it is exhausted.
    del context
def process_element(catalog):
    """Placeholder callback for ``fast_iter``: prints a fixed message.

    Args:
        catalog: The element handed over by ``fast_iter``; not used yet.
    """
    print("List all element of <product>")
# Parse the file incrementally, asking only for 'end' events of <catalog>
# elements, and pass each reported element to process_element via fast_iter.
# NOTE(review): in the sample document <catalog> is the root element, so its
# 'end' event would presumably fire only after the whole tree has been read;
# tag='product' may be what is wanted for a large file -- confirm.
context = etree.iterparse(myfile, tag='catalog', events = ('end', ))
fast_iter(context, process_element)
答案 0 :(得分:1)
这是我的问题的解决方案:
def process_element(catalog):
    """Print the tag and text of every child element of each ``<product>``.

    Args:
        catalog: Element whose direct ``product`` children are inspected.
    """
    for product in catalog.findall('product'):
        # '*' matches every direct child element, whatever its tag name.
        for element in product.findall('*'):
            print(element.tag)
            print(element.text)
答案 1 :(得分:1)
您可以使用XPath 'product/*[starts-with(local-name(),"element")]'
:
import lxml.etree as ET
import io
# Small in-memory XML document used as the input of this example.
content = '''\
<catalog>
<product>
<element1>text 1</element1>
<element2>text 2</element2>
<element3>text ..</element3>
</product>
</catalog>'''
def fast_iter(context, func, *args, **kwargs):
    """Call ``func`` on each element from ``context`` and prune finished nodes.

    Based on Liza Daly's ``fast_iter``:
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm

    Unlike the original, this variant removes preceding siblings not only of
    the processed element but of each of its ancestors as well.

    Args:
        context: Iterable of ``(event, elem)`` pairs, such as the object
            returned by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for _event, elem in context:
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed.
        elem.clear()
        # Also eliminate now-empty references from the root node to elem:
        # for elem and each of its ancestors, delete the parent's leading
        # children until no preceding sibling remains.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
    # Drop the local reference to the iterator once it is exhausted.
    del context
def process_element(catalog):
    """Print each child of a ``<product>`` whose local name starts with "element".

    Args:
        catalog: Element providing an ``xpath`` method; the query is
            evaluated relative to it.
    """
    matches = catalog.xpath('product/*[starts-with(local-name(),"element")]')
    for elt in matches:
        print(elt)
# io.BytesIO only accepts bytes-like data (passing str raises TypeError on
# Python 3), so encode the sample document before wrapping it in a stream.
context = ET.iterparse(
    io.BytesIO(content.encode('utf-8')), tag='catalog', events=('end', ))
fast_iter(context, process_element)
输出:
<Element element1 at 0xb7449374>
<Element element2 at 0xb744939c>
<Element element3 at 0xb74493c4>
顺便说一句,我对Liza Daly的fast_iter进行了更改,这将删除更多未使用的元素。这可以在解析大型XML文件时减少内存需求。
以下是一个示例,其中显示了上述修改后的fast_iter
如何删除比原始fast_iter
更多的元素:
import logging
import textwrap
import lxml.etree as ET
import io
# Module-level logger used by both fast_iter variants below.
logger = logging.getLogger(__name__)
# At INFO only the "Deleting ..." messages are shown; the per-element
# "Processing"/"Clearing"/"Checking" messages are logged at DEBUG.
level = logging.INFO
# level = logging.DEBUG # uncomment to see more debugging information
logging.basicConfig(level=level)
def fast_iter(context, func, *args, **kwargs):
    """Call ``func`` on each element from ``context``, pruning and logging.

    Based on Liza Daly's ``fast_iter``:
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm

    After an element is processed it is cleared, and preceding siblings of
    the element and of each of its ancestors are deleted. Every deletion is
    logged at INFO level; per-element details are logged at DEBUG level.

    Args:
        context: Iterable of ``(event, elem)`` pairs, such as the object
            returned by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for _event, elem in context:
        # Serialising an element is wasted work unless DEBUG output is
        # actually enabled, so guard the tostring() calls.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Processing {e}'.format(e=ET.tostring(elem)))
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Clearing {e}'.format(e=ET.tostring(elem)))
        elem.clear()
        # Also eliminate now-empty references from the root node to elem.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            logger.debug('Checking ancestor: %s', ancestor.tag)
            while ancestor.getprevious() is not None:
                parent = ancestor.getparent()
                logger.info('Deleting %s', parent[0].tag)
                del parent[0]
    # Drop the local reference to the iterator once it is exhausted.
    del context
def orig_fast_iter(context, func, *args, **kwargs):
    """Original ``fast_iter`` with logging, kept for comparison.

    After an element is processed it is cleared and its own preceding
    siblings are deleted; ancestors are left untouched. Every deletion is
    logged at INFO level; per-element details are logged at DEBUG level.

    Args:
        context: Iterable of ``(event, elem)`` pairs, such as the object
            returned by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for _event, elem in context:
        # Serialising an element is wasted work unless DEBUG output is
        # actually enabled, so guard the tostring() calls.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Processing {e}'.format(e=ET.tostring(elem)))
        func(elem, *args, **kwargs)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Clearing {e}'.format(e=ET.tostring(elem)))
        elem.clear()
        # Delete the parent's leading children until elem has no preceding
        # sibling left.
        while elem.getprevious() is not None:
            parent = elem.getparent()
            logger.info('Deleting %s', parent[0].tag)
            del parent[0]
    # Drop the local reference to the iterator once it is exhausted.
    del context
def setup_ABC():
    """Return the sample XML document used to compare the fast_iter variants.

    The document has a ``<root>`` with two children, ``<A1>`` and ``<A2>``,
    each holding a ``B``, a ``<C>`` and an ``E`` element in that order.

    Returns:
        The XML text as a ``str``.
    """
    # The literal is kept flush-left so the returned text is unchanged.
    content = textwrap.dedent('''\
<root>
<A1>
<B1></B1>
<C>1<D1></D1></C>
<E1></E1>
</A1>
<A2>
<B2></B2>
<C>2<D></D></C>
<E2></E2>
</A2>
</root>
''')
    return content
content = setup_ABC()
# io.BytesIO only accepts bytes-like data (passing str raises TypeError on
# Python 3), so encode the document before wrapping it in a stream.
context = ET.iterparse(
    io.BytesIO(content.encode('utf-8')), events=('end', ), tag='C')
orig_fast_iter(context, lambda elem: None)
# Expected log output (the "Deleting" messages are emitted at INFO level):
# INFO:__main__:Deleting B1
# INFO:__main__:Deleting B2
print('-'*80)
# The improved fast_iter deletes A1. The original fast_iter does not.
content = setup_ABC()
# io.BytesIO only accepts bytes-like data (passing str raises TypeError on
# Python 3), so encode the document before wrapping it in a stream.
context = ET.iterparse(
    io.BytesIO(content.encode('utf-8')), events=('end', ), tag='C')
fast_iter(context, lambda elem: None)
# Expected log output (the "Deleting" messages are emitted at INFO level):
# INFO:__main__:Deleting B1
# INFO:__main__:Deleting A1
# INFO:__main__:Deleting B2
因此,您会看到修改后的fast_iter
设法删除A1
元素,因为在处理第二个C
元素时不需要它。原始fast_iter
仅删除C
元素之前的同级元素(即B
元素)。你可以想象A1
之类的东西在大型XML文件中可能会非常大,而且可能会有很多这样的元素。因此修改后的fast_iter
将允许回收原始fast_iter
无法释放的大量内存。
答案 2 :(得分:1)
def process_element(catalog, *args, **kwargs):
    """Print the text of every direct child of ``catalog``.

    Args:
        catalog: Element whose direct children are visited.
        *args: Accepted for compatibility with ``fast_iter``; unused.
        **kwargs: Accepted for compatibility with ``fast_iter``; unused.
    """
    # Iterate the element directly: getchildren() is deprecated in lxml and
    # no longer exists on xml.etree.ElementTree elements (removed in 3.9).
    for child in catalog:
        print(child.text)