我正在尝试将大型 XML 文件转换为 CSV 格式。下面是我的代码和示例文件。示例文件(XML 文件的一部分):
<PostalAddress>
<Id>5464443597076195439</Id>
<AddUserId>SYSTEM_USER</AddUserId>
<AddDate>2013-01-05T18:08:42-06:00</AddDate>
<LastPersistenceTransactionUserId>SYSTEM_USER</LastPersistenceTransactionUserId>
<LastPersistenceTransactionDate>2013-07-11T08:21:34-05:00</LastPersistenceTransactionDate>
<LastPersistenceTransactionType tc="2"/>
<ExternalReferenceId>3200723</ExternalReferenceId>
<SchemaVersion>2</SchemaVersion>
<Type tc="1"/>
<Usage tc="2"/>
<Valid>true</Valid>
<Overridable>true</Overridable>
<Preferred>false</Preferred>
<Line1>4849 RONSON CT</Line1>
<Line2>STE 208</Line2>
<City>SAN DIEGO</City>
<State tc="6"/>
<PostalCode>92111</PostalCode>
<Country tc="1"/>
</PostalAddress>
以下是我的代码
import codecs
import xml.etree.ElementTree as et
import sys
class gokul:
    """Stream an XML file and write one CSV row per record element.

    The input is parsed incrementally with ``iterparse`` so the whole
    document never has to be held in memory. An instance is single-use:
    ``convert`` consumes the parser and closes the output file.
    """

    def __init__(self, input_file, output_file, encoding='utf-8'):
        """Open the XML input for iteration and the CSV output for writing.

        Args:
            input_file: Path (or file object) of the XML document.
            output_file: Path of the CSV file to create or overwrite.
            encoding: Character encoding used for the output file.

        Raises:
            OSError: If the output file cannot be opened.
            LookupError: If ``encoding`` is not a known codec.
        """
        self.output_buffer = []
        self.output = None
        # Both events are needed: "start" builds nested header names,
        # "end" is when an element's text is guaranteed to be complete.
        self.context = et.iterparse(input_file, events=("start", "end"))
        try:
            self.output = codecs.open(output_file, "w", encoding=encoding)
        except (OSError, LookupError):
            print("Failed to open the output file")
            raise

    def convert(self, tag="item", delimiter=",", ignore=(), noheader=False,
                limit=-1, buffer_size=1000) -> int:
        """Convert the XML input to CSV and return the number of rows written.

        Every element named ``tag`` is one record; each element nested
        inside it contributes one column holding its stripped text (an
        element without text, e.g. one carrying only attributes, yields an
        empty field). The header row is derived from the first record, with
        nested element names joined by ``_``.

        Args:
            tag: Name of the element that delimits one record.
            delimiter: Field separator used in the output.
            ignore: Element names to leave out of the output.
            noheader: If true, do not write a header row.
            limit: Stop after this many records; ``-1`` means no limit.
            buffer_size: Number of rows buffered before flushing to disk.

        Returns:
            The number of records written.
        """
        items = []
        header_line = []
        field_name = ''
        tagged = False   # header row already decided/written
        started = False  # first record element has been seen
        n = 0

        try:
            # The first event is the start of the document root.
            event, root = next(self.context)

            for event, elem in self.context:
                # A non-ignored element seen after the first record started
                # is treated as a field of the current record.
                should_write = (elem.tag != tag and started
                                and elem.tag not in ignore)
                # Header names are only collected while no header exists yet.
                should_tag = not tagged and should_write and not noheader

                if event == 'start':
                    if elem.tag == tag and not started:
                        started = True
                    elif should_tag:
                        # Nested elements get "parent_child" style names.
                        field_name = ('_'.join((field_name, elem.tag))
                                      if field_name else elem.tag)
                else:
                    if should_write:
                        if should_tag:
                            header_line.append(field_name)
                            # Pop this element off the name chain.
                            field_name = field_name.rpartition(
                                '_' + elem.tag)[0]
                        # Double embedded quotes, as CSV quoting requires.
                        items.append('' if elem.text is None
                                     else elem.text.strip().replace('"', r'""'))
                    elif elem.tag == tag and items:
                        # End of a record: emit the header once, then the row.
                        if header_line and not tagged:
                            self.output.write(
                                delimiter.join(header_line) + '\n')
                        tagged = True
                        self.output_buffer.append(
                            r'"' + (r'"' + delimiter + r'"').join(items) + r'"')
                        items = []
                        n += 1
                        if n == limit:
                            break
                        if len(self.output_buffer) > buffer_size:
                            self._write_buffer()
                        # Cleared records would otherwise stay attached to
                        # the root and accumulate for the whole document.
                        root.clear()
                    # Release the element's text and children.
                    elem.clear()

            self._write_buffer()
        finally:
            # Close the output even if parsing fails part-way through.
            self.output.close()
        return n

    def _write_buffer(self) -> None:
        """Write buffered rows to the output file and empty the buffer."""
        if not self.output_buffer:
            # Nothing pending; avoid emitting a stray blank line.
            return
        self.output.write('\n'.join(self.output_buffer) + '\n')
        self.output_buffer = []
if __name__ == "__main__":
    # Command-line entry point: <input.xml> <output.csv> <record_tag>.
    # Guarded so that importing this module does not start a conversion.
    if len(sys.argv) < 4:
        sys.exit(f"usage: {sys.argv[0]} <input.xml> <output.csv> <record_tag>")
    converter = gokul(sys.argv[1], sys.argv[2], encoding="utf-8")
    converter.convert(tag=sys.argv[3])