我正在使用 iterparse 来解析 Python 中的大型(~1GB)XML 文件。一切看起来都不错,
除了我无法删除 </ROW> 标签之后的那些空行(empty lines)。
我尝试添加 elem.text.strip()+"\n\t\t" 和 elem.tail.strip()+"\n\t",这是有效的。
但我不确定这是否是处理它的最佳方法,以及如何移除 </data> 标记之前的制表符(tab)。
# Sample input document: a <DATA> root holding three <ROW> records, each with
# an <assmtid> and a <Year> child. The second record carries the literal text
# "NULL" in <Year>.
xml_str = """
<DATA>
<ROW>
<assmtid>1</assmtid>
<Year>1988</Year>
</ROW>
<ROW>
<assmtid>2</assmtid>
<Year>NULL</Year>
</ROW>
<ROW>
<assmtid>2</assmtid>
<Year>1990</Year>
</ROW>
</DATA>
"""
def getelements(pth, ele_xml):
    """Yield each ``ROW`` element of ``xml_str`` after normalising it.

    For every completed ``ROW`` element the tag and the tags of its direct
    children are lower-cased, child text that is ``None`` or the literal
    string ``"NULL"`` is replaced with an empty string, and the element's
    own text/tail whitespace is replaced by a fixed newline-plus-tabs
    sequence.

    Args:
        pth: Not used by this function; the input is read from the
            module-level ``xml_str``.
        ele_xml: Not used by this function.

    Yields:
        The normalised ``ROW`` element (its tag is now ``"row"``). The root
        element is cleared after the consumer resumes the generator, so each
        element must be serialised before the next one is requested.
    """
    context = iter(etree.iterparse(StringIO(xml_str), events=('start', 'end')))
    # The first event delivered is the 'start' of the document root; keep a
    # handle on it so that processed rows can be released as we go.
    _, root = next(context)
    for event, elem in context:
        if event != 'end' or elem.tag != "ROW":
            continue
        elem.tag = elem.tag.lower()
        # text/tail are None when the source has no whitespace at that spot
        # (or the tail has not been parsed yet), so fall back to "" instead
        # of calling .strip() on None.
        elem.text = (elem.text or "").strip() + "\n\t\t"
        elem.tail = (elem.tail or "").strip() + "\n\t"
        for child in elem:
            child.tag = child.tag.lower()
            if child.text is None or child.text == "NULL":
                child.text = ""
        yield elem
        # Drop everything accumulated under the root so far to keep memory
        # usage flat while streaming.
        root.clear()
# Stream the converted rows into the output file, wrapped in a lower-case
# <data> root. The file is opened in binary mode and the serialised rows are
# written with an explicit utf-8 encoding, so the wrapper tags are written as
# bytes literals as well (a plain str literal cannot be written to a binary
# file on Python 3).
with open(pth_new_to_xml, 'wb') as out_file:
    # start root; a single "\n" terminates the line (the previous "\n\r"
    # left a stray carriage return in front of the first row)
    out_file.write(b'<data>\n')
    for page in getelements(pth, ele_xml):
        out_file.write(etree.tostring(page, encoding='utf-8'))
    # close root
    out_file.write(b'</data>')
<data>
<row>
<assmtid>1</assmtid>
<year>1988</year>
</row>
<row>
<assmtid>2</assmtid>
<year />
</row>
<row>
<assmtid>2</assmtid>
<year>1990</year>
</row>
</data>