如何使用Python3拆分18 GB xml文件?

时间:2014-12-30 19:46:19

标签: xml large-files python-3.4

我需要将一个大的xml文件(18GB)拆分成较小的文件,同时保持原始的xml结构。文件中的所有元素都属于单个根元素。

我用readlines()方法成功做到了这一点,但我想知道在Python3中是否有更好的方法。

经过一番搜索,我找到了一段看起来可以工作的代码,但它运行起来相当耗时。一定有更快的方法可以做到这一点——非常感谢任何帮助。

这会将文件拆分为~21个较小的文件:

# Line count of the input, measured once with:
#   num_lines = sum(1 for line in open(r'largefile.xml', encoding = "utf8"))
# which printed 14461067.

# Number of input lines written to each output file.
chunksize = 700000

# NOTE: this splits purely on line boundaries; the individual pieces are not
# guaranteed to be well-formed XML documents on their own.
fid = 0
with open(r'largefile.xml', encoding="utf8") as infile:
    # Every output file is opened with the same encoding as the input so
    # non-ASCII text survives regardless of the platform default.
    f = open('file%d.xml' % fid, 'w', encoding="utf8")
    try:
        for i, line in enumerate(infile):
            # Rotate *before* writing, and never at i == 0; otherwise the
            # first output file would contain a single line only.
            if i and not i % chunksize:
                f.close()
                fid += 1
                f = open('file%d.xml' % fid, 'w', encoding="utf8")
            f.write(line)
    finally:
        # Close whichever output file is current, even if a read/write failed.
        f.close()

1 个答案:

答案 0 :(得分:0)

GitHub上有一个脚本(script),我以前用它做过完全相同的事情。

#!/usr/bin/env python

import os
import xml.parsers.expat
from xml.sax.saxutils import escape
from optparse import OptionParser
from math import log10


# Amount of input read and handed to the parser per iteration
CHUNK_SIZE = 1024 ** 2

# Stack of (name, attrs) for the elements that are currently open
path = []

# Running size estimate of what went into the current output file
cur_size = 0
# Size estimate above which a new output file is started (1024 * 1024)
MAX_SIZE = 1024 ** 2

# Index of the output file being written
cur_idx = 0
# Handle of the output file being written
cur_file = None

# Format string that turns the index into the filename infix
FMT = ".%d"

# Pieces of the output filename: directory, base name and extension
out_dir = None
root = None
ext = None

# Flat attribute-style list describing the XML declaration, once seen
xml_declaration = None

# (name, attrs) of the last start tag whose text has not been written yet
start = None

# True while the current file is being closed and the next one opened
ending = False

def attrs_s(attrs):
    """Serialise a flat attribute list into XML attribute text.

    ``attrs`` is a flat sequence alternating names and values
    (``[name0, value0, name1, value1, ...]``).  The result is either an
    empty string or every ``name="value"`` pair preceded by a single space,
    ready to be placed right after an element name.
    """
    # The values are emitted between double quotes, so besides & < > (handled
    # by ``escape``) the double quote itself must be escaped, otherwise a
    # value containing '"' yields malformed XML.  Literal newlines/tabs are
    # written as character references so they are not normalised to spaces
    # when the output is parsed again.
    extra = {'"': '&quot;', '\n': '&#10;', '\r': '&#13;', '\t': '&#9;'}
    pairs = zip(attrs[::2], attrs[1::2])
    return ''.join(' %s="%s"' % (name, escape(value, extra))
                   for name, value in pairs)

def next_file():
    """Cut the current output file and start a new one when it is too big.

    Nothing happens unless the size estimate ``cur_size`` exceeds
    ``MAX_SIZE``, no split is already in progress, and at least one element
    is still open.  Otherwise every open element in ``path`` is closed in the
    current file, the file is closed, the next numbered file is opened, the
    XML declaration (if one was seen) is written again and the open elements
    are re-opened so parsing can continue where it stopped.
    """
    global cur_size, ending, cur_file, cur_idx
    if ending or cur_size <= MAX_SIZE:
        # Below the threshold, or already in the middle of a split
        return
    if not path:
        # No element is open (the root has just been closed): splitting here
        # would create a file holding only the XML declaration, which is not
        # a well-formed document.
        return
    # Parenthesised form works as a statement on Python 2 and as a call on 3
    print("part %d Done" % cur_idx)
    ending = True
    # Close the current elements, innermost first
    for elem in reversed(path):
        end_element(elem[0])
    # Close the file
    cur_file.close()
    # Reset the size
    cur_size = 0
    # Open another file
    cur_idx += 1
    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext),
                    'wt')
    if xml_declaration is not None:
        cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))
    # Start again where we stopped, outermost first
    for elem in path:
        start_element(*elem)
    # We are done 'ending'
    ending = False

def xml_decl(version, encoding, standalone):
    """Record the XML declaration and copy it to the current output file.

    Installed as the parser's ``XmlDeclHandler``.  The declaration is stored
    in ``xml_declaration`` as a flat ``[name, value, ...]`` list so that it
    can be written again at the top of every subsequent output file.

    ``standalone`` is compared against -1, which is treated as "clause not
    present"; any other value is written as ``yes``/``no``.
    """
    global xml_declaration
    decl = ['version', version]
    # A declaration without an encoding clause must not put ``None`` in the
    # list: it would reach ``escape()`` through ``attrs_s`` and crash.
    if encoding is not None:
        decl.extend(['encoding', encoding])
    if standalone != -1:
        decl.extend(['standalone', 'yes' if standalone else 'no'])
    xml_declaration = decl
    cur_file.write('<?xml%s?>\n' % attrs_s(xml_declaration))


def start_element(name, attrs):
    """Parser callback for a start tag (also used to replay open elements).

    The tag itself is not written yet: it is remembered in ``start`` so that
    a later callback can decide how to emit it.  A start tag that was already
    pending is flushed first as a regular opening tag.
    """
    global cur_size, start
    pending = start
    if pending is not None:
        # A start directly following another start: emit the earlier one
        cur_file.write('<%s%s>' % (pending[0], attrs_s(pending[1])))
    start = (name, attrs)
    if not ending:
        # Regular parsing (not a replay during a file switch): account for
        # the element and push it on the stack of open elements
        cur_size += len(name) + sum(len(item) for item in attrs)
        path.append((name, attrs))

def end_element(name):
    """Parser callback for an end tag (also used to close open elements).

    If the matching start tag is still pending, the element had no content
    and is written in its self-closing form; otherwise a plain closing tag
    is written.
    """
    global cur_size, start
    if start is None:
        # Content was written since the start tag: close it normally
        cur_file.write('</%s>' % name)
    else:
        # Start tag never flushed, so the element is empty
        cur_file.write('<%s%s/>' % (start[0], attrs_s(start[1])))
    start = None
    if not ending:
        # Regular parsing: pop the element, account for it, and check
        # whether this is a good place to switch files
        closed = path.pop()
        assert closed[0] == name
        cur_size += len(name)
        next_file()


def char_data(data):
    """Parser callback for character data.

    Flushes a pending start tag (the data belongs to that element), escapes
    the data, writes it to the current output file and adds its length to
    the size estimate.  When the data did not directly follow a start tag,
    ``next_file()`` is called because this may be a suitable split point.
    """
    global cur_size, start
    wrote_start = False
    if start is not None:
        # The data belongs to an element, write its start tag first
        cur_file.write('<%s%s>' % (start[0], attrs_s(start[1])))
        start = None
        wrote_start = True
    # ``escape`` is more than needed here: only & and < have to be escaped
    data = data.replace('&', '&amp;').replace('<', '&lt;')
    # Only a chunk that is exactly '>' is turned into an entity
    # NOTE(review): presumably a resolved ``&gt;`` arrives as its own chunk
    # -- confirm against the parser's behaviour.
    if data == '>':
        data = '&gt;'
    # ``cur_file`` is opened in text mode.  On Python 3 it only accepts
    # ``str``, so writing UTF-8 encoded bytes raises TypeError; the explicit
    # encode is kept only where ``str`` is the byte string type (Python 2).
    if str is bytes:
        cur_file.write(data.encode('utf-8'))
    else:
        cur_file.write(data)
    cur_size += len(data)
    if not wrote_start:
        # The data did not directly follow a start tag; it could be the
        # right moment to make the split
        next_file()
def main(filename, output_dir):
    """Split the XML file ``filename`` into numbered smaller files.

    ``filename`` is the path of the input document.  ``output_dir`` is the
    directory receiving the parts; when it is ``None`` the parts are written
    next to the input file.  Parts are named ``<root><index><ext>`` where the
    index is zero-padded according to the estimated number of parts.
    """
    global cur_file, cur_idx
    global out_dir, root, ext
    global FMT

    # Create a parser
    p = xml.parsers.expat.ParserCreate()
    # We want to reproduce the input, so we are interested in the order of
    # the attributes (they are then passed as a flat name/value list)
    p.ordered_attributes = 1

    # Set our callbacks (comments are stripped because no callback is
    # defined for them)
    p.XmlDeclHandler = xml_decl
    p.StartElementHandler = start_element
    p.EndElementHandler = end_element
    p.CharacterDataHandler = char_data

    # Keep the complete input path: ``filename`` is reduced to its base name
    # below, and opening that would only work from the input's own directory.
    in_path = filename

    # Pad the index according to the expected number of parts.  Clamping to
    # at least 1 avoids log10(0) for an empty (or, with integer division,
    # any small) input file.
    parts = max(1, os.path.getsize(in_path) // MAX_SIZE)
    FMT = ".%%0%dd" % (int(log10(parts)) + 1)

    out_dir, filename = os.path.split(filename)
    if output_dir is not None:
        out_dir = output_dir

    root, ext = os.path.splitext(filename)

    cur_file = open(os.path.join(out_dir, root + FMT % cur_idx + ext), 'wt')
    try:
        # Read raw bytes so the parser decodes the document itself instead
        # of relying on the platform's default text encoding.
        with open(in_path, 'rb') as xml_file:
            while True:
                chunk = xml_file.read(CHUNK_SIZE)
                if not chunk:
                    # End of file: tell the parser we're done
                    p.Parse(b'', True)
                    break
                # Process the chunk
                p.Parse(chunk, False)
    finally:
        # ``cur_file`` may have been replaced by ``next_file`` meanwhile;
        # close whichever part is current, even if parsing failed.
        cur_file.close()

    # Parenthesised form works as a statement on Python 2 and as a call on 3
    print("part %d Done" % cur_idx)

# Command-line entry point: parse the options, optionally override the split
# threshold, then split the single XML file given as argument.
if __name__ == "__main__":
    parser = OptionParser(usage="usage: %prog [options] XML_FILE")
    # The trailing space keeps the two concatenated literals from running
    # together in the help output.
    parser.add_option("-o", "--output-dir",
        help="Specify the directory where the xml files will be written "
             "(default to the same directory where the original file is)")
    parser.add_option("-M", "--max_size", type="int",
        help="Specify the size at which the files should be split (in Kb)")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        parser.error("incorrect number of arguments")
    if options.max_size is not None:
        # Zero or a negative threshold would break the size arithmetic in
        # main(), so reject it up front with a usage error.
        if options.max_size <= 0:
            parser.error("--max_size must be a positive integer")
        # The option is given in Kb, the threshold is compared against a
        # count accumulated with len()
        MAX_SIZE = options.max_size * 1024
    main(args[0], options.output_dir)