Python xml:列出item中的所有元素

时间:2013-06-25 13:46:15

标签: python xml python-3.x lxml

我需要列出<product>项中的所有元素,因为<product>的元素是可变的。

XML文件:

<catalog>
   <product>
      <element1>text 1</element1>
      <element2>text 2</element2>
      <element..>text ..</element..>
   </product>
</catalog>

Python解析器: 我使用fast_iter因为我的xml文件很大......

import lxml.etree as etree
import configs.application as configs

# Path of the XML document to parse: a configured directory plus a fixed file name.
# NOTE(review): the import alias above already binds `configs` to the
# `configs.application` module, so `configs.application.tmp` only resolves if
# that module itself exposes an `application` attribute -- confirm.
myfile = configs.application.tmp + '/xml_hug_file.xml'

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element yielded by ``context``, freeing memory as it goes.

    Args:
        context: Iterable of ``(event, elem)`` pairs, e.g. the object returned
            by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for event, elem in context:
        func(elem, *args, **kwargs)
        # func is finished with the element, so its content can be released.
        elem.clear()
        # Drop the preceding siblings that were already handled.  A root
        # element has no parent but may still report a preceding sibling
        # (a top-level comment or processing instruction); there is nothing
        # to delete from in that case, so it is skipped instead of crashing.
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
    # Release this function's reference to the parse context.
    del context

def process_element(catalog):
    """Placeholder callback: print a fixed banner; ``catalog`` is not inspected."""
    banner = "List all element of <product>"
    print(banner)

# Stream the file and hand every finished <catalog> element to process_element.
# NOTE(review): in the sample document <catalog> is the root element, so its
# 'end' event presumably fires only after the whole file has been parsed --
# confirm whether a smaller tag (e.g. 'product') is intended for huge files.
context = etree.iterparse(myfile, tag='catalog', events = ('end', ))
fast_iter(context, process_element)

3 个答案:

答案 0 :(得分:1)

这是我的问题的解决方案:

def process_element(catalog):
    """Print the tag and then the text of every child element of each <product>.

    Args:
        catalog: Element whose direct ``product`` children are inspected.
    """
    # 'product/*' selects, in document order, every element child of every
    # direct <product> child -- the same elements the nested loops visit.
    for field in catalog.findall('product/*'):
        print(field.tag, field.text, sep='\n')

答案 1 :(得分:1)

您可以使用XPath 'product/*[starts-with(local-name(),"element")]'


import lxml.etree as ET
import io

# Sample catalog document used by this example; note it is a text string (str).
content = '''\
<catalog>
   <product>
      <element1>text 1</element1>
      <element2>text 2</element2>
      <element3>text ..</element3>
   </product>
</catalog>'''

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element yielded by ``context``, pruning the tree as it goes.

    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    Author: Liza Daly
    See also http://effbot.org/zone/element-iterparse.htm

    Args:
        context: Iterable of ``(event, elem)`` pairs, e.g. the object returned
            by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for event, elem in context:
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        for ancestor in elem.xpath('ancestor-or-self::*'):
            parent = ancestor.getparent()
            if parent is None:
                # The root element has no parent to delete from, even when a
                # top-level comment or processing instruction precedes it.
                continue
            while ancestor.getprevious() is not None:
                del parent[0]
    # Release this function's reference to the parse context.
    del context


def process_element(catalog):
    """Print each child of a <product> whose local name starts with "element".

    Args:
        catalog: Object providing an ``xpath`` method; the query is evaluated
            relative to it.
    """
    matches = catalog.xpath('product/*[starts-with(local-name(),"element")]')
    for match in matches:
        print(match)

# io.BytesIO only accepts bytes-like data, so the text document is encoded
# first (passing the str directly raises TypeError on Python 3).
context = ET.iterparse(io.BytesIO(content.encode('utf-8')), tag='catalog', events = ('end', ))
fast_iter(context, process_element)

输出:

<Element element1 at 0xb7449374>
<Element element2 at 0xb744939c>
<Element element3 at 0xb74493c4>

顺便说一句,我对Liza Daly的fast_iter做了修改,使其能够删除更多不再使用的元素。这可以在解析大型XML文件时减少内存需求。

以下是一个示例,其中显示了上述修改后的fast_iter如何删除比原始fast_iter更多的元素:

import logging
import textwrap
import lxml.etree as ET
import io

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Verbosity passed to basicConfig below; the 'Deleting ...' messages are
# emitted at INFO, the remaining trace messages at DEBUG.
level = logging.INFO
# level = logging.DEBUG  # uncomment to see more debugging information
logging.basicConfig(level=level)

def fast_iter(context, func, *args, **kwargs):
    """Apply ``func`` to each element yielded by ``context``, pruning the tree as it goes.

    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    Author: Liza Daly
    See also http://effbot.org/zone/element-iterparse.htm

    Every deletion is logged at INFO level; the remaining trace messages are
    logged at DEBUG level.

    Args:
        context: Iterable of ``(event, elem)`` pairs, e.g. the object returned
            by ``lxml.etree.iterparse``.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        None.
    """
    for event, elem in context:
        # Serialising an element can be expensive, so only do it when the
        # DEBUG message will actually be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Processing %s', ET.tostring(elem))
        func(elem, *args, **kwargs)
        # It's safe to call clear() here because no descendants will be
        # accessed
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Clearing %s', ET.tostring(elem))
        elem.clear()
        # Also eliminate now-empty references from the root node to elem
        for ancestor in elem.xpath('ancestor-or-self::*'):
            logger.debug('Checking ancestor: %s', ancestor.tag)
            parent = ancestor.getparent()
            if parent is None:
                # The root element has no parent to delete from, even when a
                # top-level comment or processing instruction precedes it.
                continue
            while ancestor.getprevious() is not None:
                logger.info('Deleting %s', parent[0].tag)
                del parent[0]
    # Release this function's reference to the parse context.
    del context

def orig_fast_iter(context, func, *args, **kwargs):
    """Baseline ``fast_iter``: prune only the preceding siblings of each element.

    Kept for comparison with the ancestor-aware variant.  Deletions are
    logged at INFO level, the other trace messages at DEBUG level.

    Args:
        context: Iterable of ``(event, elem)`` pairs.
        func: Callable invoked as ``func(elem, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``func``.
        **kwargs: Extra keyword arguments forwarded to ``func``.
    """
    for _event, node in context:
        before = ET.tostring(node)
        logger.debug('Processing {e}'.format(e=before))
        func(node, *args, **kwargs)
        after = ET.tostring(node)
        logger.debug('Clearing {e}'.format(e=after))
        node.clear()
        # Remove whatever still precedes the node under its parent.
        while node.getprevious() is not None:
            siblings = node.getparent()
            logger.info('Deleting {p}'.format(p=siblings[0].tag))
            del siblings[0]
    del context

def setup_ABC():
    """Return the sample XML document (as text) used by the pruning demo.

    The document has a <root> with two children, <A1> and <A2>; each holds a
    <B*>, a <C> (with text and one child) and an <E*> element.
    """
    rows = (
        '<root>',
        '  <A1>',
        '    <B1></B1>',
        '    <C>1<D1></D1></C>',
        '    <E1></E1>',
        '  </A1>',
        '  <A2>',
        '    <B2></B2>',
        '    <C>2<D></D></C>',
        '    <E2></E2>',
        '  </A2>',
        '</root>',
    )
    # The document ends with a single trailing newline.
    return '\n'.join(rows) + '\n'

# io.BytesIO only accepts bytes-like data, so the text returned by
# setup_ABC() is encoded first (a str raises TypeError on Python 3).
content = setup_ABC()
context = ET.iterparse(io.BytesIO(content.encode('utf-8')), events=('end', ), tag='C')
orig_fast_iter(context, lambda elem: None)
# INFO:__main__:Deleting B1
# INFO:__main__:Deleting B2

print('-'*80)
# The improved fast_iter deletes A1. The original fast_iter does not.
content = setup_ABC()
context = ET.iterparse(io.BytesIO(content.encode('utf-8')), events=('end', ), tag='C')
fast_iter(context, lambda elem: None)
# INFO:__main__:Deleting B1
# INFO:__main__:Deleting A1
# INFO:__main__:Deleting B2

因此,您会看到修改后的fast_iter能够删除A1元素,因为在处理第二个C元素时已不再需要它。原始fast_iter只会删除C元素前面的兄弟元素(即B元素)。可以想象,像A1这样的元素在大型XML文件中可能非常大,而且可能有很多这样的元素。因此,修改后的fast_iter能够回收原始fast_iter无法释放的大量内存。

答案 2 :(得分:1)

def process_element(catalog, *args, **kwargs):
    """Print the text of every direct child of ``catalog``.

    Args:
        catalog: Element whose direct children are visited in document order.
        *args: Accepted for compatibility with ``fast_iter``; unused.
        **kwargs: Accepted for compatibility with ``fast_iter``; unused.
    """
    # Iterate the element directly: getchildren() is deprecated in lxml and
    # no longer exists on xml.etree.ElementTree elements in current Python.
    for child in catalog:
        print(child.text)