合并大量XML文件

时间:2015-09-18 12:21:24

标签: python xml elementtree

我有很多需要合并的xml文件。我在merging xml files using python's ElementTree尝试了此链接 其代码是(根据我的需要编辑):

import os, os.path, sys
import glob
from xml.etree import ElementTree

def run(files):
    xml_files = glob.glob(files +"/*.xml")
    xml_element_tree = None
    for xml_file in xml_files:
        print xml_file
        data = ElementTree.parse(xml_file).getroot()
        # print ElementTree.tostring(data)
        for result in data.iter('TALLYMESSAGE'):
            if xml_element_tree is None:
                xml_element_tree = data 
                insertion_point = xml_element_tree.findall("./BODY/DATA/TALLYMESSAGE")[0]
            else:
                insertion_point.extend(result) 
    if xml_element_tree is not None:
        f =  open("myxmlfile.xml", "wb")
        f.write(ElementTree.tostring(xml_element_tree))
run("F:/data/data")

但问题是我有很多XML文件,365个是精确的,每个至少2 MB。合并它们都会导致我的电脑崩溃。 这是我的xml文件的xml树的图像: XML element tree

我的新更新代码是:

import os, os.path, sys
import glob
from lxml import etree
def XSLFILE(files):
    xml_files = glob.glob(files +"/*.xml")
    #print xml_files[0]
    xslstring = """<?xml version="1.0" ?> 
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> 
<xsl:template match="/DATA">
<DATA>
<xsl:copy>
<xsl:copy-of select="TALLYMESSAGE"/>\n"""
    #print xslstring
    for xmlfile in xml_files[1:]:
        xslstring = xslstring + '<xsl:copy-of select="document(\'' + xmlfile[-16:] + "')/BODY/DATA/TALLYMESSAGE\"/>\n"
    xslstring = xslstring + """</xsl:copy>+
</DATA>
</xsl:template> 
</xsl:transform>"""
    #print xslstring
    with open("parsingxsl.xsl", "w") as f:
        f.write(xslstring)
    with open(xml_files[0], "r") as f:
        dom = etree.XML(f.read())
    print etree.tostring(dom)
    with open('F:\data\parsingxsl.xsl', "r") as f:
        xslt_tree = etree.XML(f.read())
    print xslt_tree
    transform = etree.XSLT(xslt_tree)
    newdom = transform(dom)
    #print newdom
    tree_out = etree.tostring(newdom, encoding='UTF-8', pretty_print=True,  xml_declaration=True)
    print(tree_out)

    xmlfile = open('F:\data\OutputFile.xml','wb')
    xmlfile.write(tree_out)
    xmlfile.close()
XSLFILE("F:\data\data")

运行时相同会产生以下错误:

Traceback (most recent call last):
  File "F:\data\xmlmergexsl.py", line 38, in <module>
    XSLFILE("F:\data\data")
  File "F:\data\xmlmergexsl.py", line 36, in XSLFILE
    xmlfile.write(tree_out)
TypeError: must be string or buffer, not None

1 个答案:

答案 0 :(得分:2)

考虑使用XSLT及其document()函数来合并XML文件。 Python(与许多面向对象的编程语言一样)在其lxml模块中维护XSLT processor。作为信息,XSLT是一种声明性编程语言,用于转换各种格式和结构的XML文件。

出于您的目的,XSLT可能比使用编程代码开发文件更有效,因为除了XSLT处理器将使用的内容之外,在处理期间没有列表或循环或其他对象保存在内存中。

XSLT (在外部保存为.xsl文件)

考虑最初运行Python写入文本文件循环以填充所有365个文档以避免复制和粘贴。另请注意,第一个文档被跳过,因为它是下面Python脚本中使用的起点:

<?xml version="1.0" ?> 
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> 

 <xsl:template match="DATA">
  <DATA>
    <xsl:copy> 
       <xsl:copy-of select="TALLYMESSAGE"/>
       <xsl:copy-of select="document('Document2.xml')/BODY/DATA/TALLYMESSAGE"/>
       <xsl:copy-of select="document('Document3.xml')/BODY/DATA/TALLYMESSAGE"/>
       <xsl:copy-of select="document('Document4.xml')/BODY/DATA/TALLYMESSAGE"/>
       ...
       <xsl:copy-of select="document('Document365.xml')/BODY/DATA/TALLYMESSAGE"/>             
    </xsl:copy>
  </DATA>
 </xsl:template> 

</xsl:transform>

Python (将包含在您的整体脚本中)

import lxml.etree as ET

dom = ET.parse('C:\Path\To\XML\Document1.xml')
xslt = ET.parse('C:\Path\To\XSL\file.xsl')
transform = ET.XSLT(xslt)
newdom = transform(dom)

tree_out = ET.tostring(newdom, encoding='UTF-8', pretty_print=True,  xml_declaration=True)
print(tree_out)

xmlfile = open('C:\Path\To\XML\OutputFile.xml','wb')
xmlfile.write(tree_out)
xmlfile.close()