我正在尝试使用以下代码解析python 2.7中的xml
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import xml.etree.ElementTree as ET
import sys, json
# Name of the input file; it is passed to parse_demo_txt() further down.
txtfile='game_file.txt'
def jd(payload):
    """Return *payload* serialized as pretty-printed JSON.

    Keys are sorted and nesting is indented by four spaces.  The
    separators are given explicitly so that no trailing whitespace is
    emitted after commas on Python 2.7, where the default item separator
    is ``', '`` even when ``indent`` is set.
    """
    return json.dumps(
        payload,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )
def parse_demo_txt(demofile):
    """Collect per-player statistics from an XML demo file.

    Args:
        demofile: Path or file object accepted by ``ET.parse``.

    Returns:
        A dict keyed by each ``<player>`` element's ``name`` attribute.
        Each value is a dict with the keys ``'death'``, ``'win'`` and
        ``'totalscore'``, holding the raw (string) values of the
        ``deaths``, ``spree`` and ``frags`` attributes respectively.

    Raises:
        KeyError: If a ``<player>`` element lacks one of the attributes
            that are read.
        xml.etree.ElementTree.ParseError: If the file is not well-formed.
    """
    scores = {}
    for player in ET.parse(demofile).iter('player'):
        attrs = player.attrib
        name = attrs['name']
        if name in scores:
            # Only the first element seen for a given name is recorded.
            continue
        scores[name] = {
            'death': attrs['deaths'],
            'win': attrs['spree'],
            'totalscore': attrs['frags'],
        }
    return scores
# Parse the configured file; the returned scores dict is not used here.
parse_demo_txt(txtfile)
源文件中包含一些非ASCII字符,导致解析时出现如下错误:
$ python parse_xml.py
Traceback (most recent call last):
File "parse_xml.py", line 38, in <module>
parse_demo_xml(xmlfile)
File "parse_xml.py", line 18, in parse_demo_xml
tree = ET.ElementTree(file=xmlfile)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 611, in __init__
self.parse(file)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 656, in parse
parser.feed(data)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 1653, in feed
self._raiseerror(v)
File "/usr/lib/python2.7/xml/etree/ElementTree.py", line 1517, in _raiseerror
raise err
xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 5, column 14
实际上,在第5行的第14列中有一些非ASCII字符。除了首先解析该文件并转换这些有问题的字符外,还有其他方法(使用纯元素树)吗?
答案 0 :(得分:0)
您得到的错误不是编码错误,而是XML错误。XML当然支持非ASCII字符(幸好如此!),因此问题不在于非ASCII字符本身,而在于文档(在 `<?xml ...?>` 声明行中)声明了ASCII编码,内容里却包含非ASCII字符。
如果您知道此文档的实际编码,则只需修正 `encoding` 声明。否则,可以将该XML文件退回给创建它的人,并要求对方提供有效的XML文件。最不得已的情况下,您可以使用 `chardet` 或 `UnicodeDammit` 来尝试猜测编码,但请记住,这终究只是一种胡乱猜测。
答案 1 :(得分:0)
好的,我最终编写了自己的函数以从原始文件中删除不需要的字符,然后将其传递给XML解析器。
def normalize_player(demofile):
    ''' Rewrite demofile in place with normalized player names.

    Every line is copied to ``demofile + '_normalized'``.  On lines that
    contain both ``<player`` and ``name``, the value of the ``name="..."``
    attribute that is immediately followed by `` team`` is passed through
    normalize_nickname() and only that attribute value is replaced.  All
    other lines are copied unchanged.  Finally the original file is
    removed and the normalized copy is moved into its place.

    NOTE(review): the files are opened in the default text mode, which
    presumably relies on Python 2 byte strings so that each character's
    ordinal stays below 256 -- confirm before running on Python 3.
    '''
    normalized_path = demofile + '_normalized'
    # Open the input first so that a missing input file does not leave an
    # empty "_normalized" file behind.
    with open(demofile, 'r') as inputfile:
        with open(normalized_path, 'w') as normalized_file:
            for line in inputfile:
                if '<player' in line and 'name' in line:
                    # [^"]* cannot run past the closing quote, unlike a
                    # greedy .* which extends to the last '" team'.
                    match = re.search(r'name="([^"]*)" team', line)
                    if match:
                        start, end = match.span(1)
                        # Splice by position so identical text elsewhere
                        # on the line is left untouched.
                        line = (line[:start]
                                + normalize_nickname(match.group(1))
                                + line[end:])
                normalized_file.write(line)
    # Replace the original file with the normalized copy.
    remove(demofile)
    move(normalized_path, demofile)
称为normalize_nickname()的函数只会将非可打印的ASCII字符替换为普通的ASCII字符。
def normalize_nickname(nickname):
    ''' Return nickname with every character mapped to readable ASCII.

    Each character's ordinal is looked up in the table returned by
    generate_translation_table() and replaced by the character with the
    mapped ordinal.  Ordinals that are missing from the table are
    replaced by '#' instead of raising KeyError.  An empty nickname
    yields an empty string.
    '''
    transformation_table = generate_translation_table()
    placeholder = ord('#')
    return ''.join(
        chr(transformation_table.get(ord(character), placeholder))
        for character in nickname
    )
最后,generate_translation_table()函数如下所示:
def generate_translation_table():
    ''' python implementation of https://github.com/deurk/mvdsv/blob/master/src/common.c#L1717

    Returns a dict with one entry for every ordinal 0..255, mapping it to
    the ordinal of a 7-bit ASCII replacement character.
    '''
    placeholder = ord('#')
    ascii_table = {}

    # Baseline: ordinals 0..31 and their +128 counterparts become '#'.
    for code in range(32):
        ascii_table[code] = placeholder
        ascii_table[code + 128] = placeholder
    # Ordinals 32..127 map to themselves; 160..255 fold onto 32..127.
    for code in range(32, 128):
        ascii_table[code] = code
        ascii_table[code + 128] = code

    # Line feed and carriage return are kept as they are.
    ascii_table[10] = 10
    ascii_table[13] = 13

    # Overrides applied to both an ordinal and its +128 counterpart.
    mirrored = {
        5: '.', 14: '.', 15: '.', 28: '.', 46: '.',  # dot
        16: '[', 17: ']',                            # square brackets
        29: '(', 31: ')',                            # round brackets
        30: '=',
    }
    # Ordinals 18..27 are the digits '0'..'9'.
    for offset, digit in enumerate('0123456789'):
        mirrored[18 + offset] = digit
    for code, glyph in mirrored.items():
        ascii_table[code] = ascii_table[code + 128] = ord(glyph)

    # Overrides that apply to a single ordinal only.
    ascii_table[128] = ord('(')
    ascii_table[130] = ord(')')
    ascii_table[127] = ord('>')  # left arrow
    ascii_table[141] = ord('<')  # right arrow
    ascii_table[129] = ord('=')
    return ascii_table