我正在尝试使用下面这个函数:
import unicodedata
def remove_accents(input_str):
    """Return ``input_str`` as unicode text with combining marks removed.

    The value is converted with ``unicode()`` and decomposed with NFKD, then
    every character that ``unicodedata.combining`` reports as a combining
    mark is dropped.

    NOTE: ``unicode(input_str)`` is called without an encoding argument. The
    reported traceback shows this call failing with the 'ascii' codec on
    byte 0xe1, i.e. when ``input_str`` is a byte string holding non-ASCII
    data.
    """
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    # combining() returns 0 for base characters, so only those are kept.
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
我在下面的代码中使用它(该代码解压并读取包含非ASCII字符串的文件),但收到了如下错误(来自库文件 C:\Python27\Lib\encodings\utf_8.py):
Message File Name Line Position
Traceback
<module> C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 64
getNameList C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 26
remove_accents C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py 17
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 3: ordinal not in range(128)
为什么会出现这个错误?怎样才能避免它,并让 remove_accents 正常工作?
感谢您的帮助!
以下是整个代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import os
import re
from zipfile import ZipFile
import csv
##def strip_accents(s):
## return ''.join((c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn'))
import unicodedata
def remove_accents(input_str):
    """Return ``input_str`` as unicode text with combining marks removed.

    The value is converted with ``unicode()`` and decomposed with NFKD, then
    every character that ``unicodedata.combining`` reports as a combining
    mark is dropped.

    NOTE: ``unicode(input_str)`` is called without an encoding argument. The
    reported traceback shows this call failing with the 'ascii' codec on
    byte 0xe1, i.e. when ``input_str`` is a byte string holding non-ASCII
    data.
    """
    nkfd_form = unicodedata.normalize('NFKD', unicode(input_str))
    # combining() returns 0 for base characters, so only those are kept.
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
def getNameList():
    """Split the loaded names into male-majority and female-majority lists.

    The per-name counts come from ``extractNamesDict()``; each name is passed
    through ``remove_accents()`` before it is stored in the result.

    Returns:
        A ``(maleNames, femaleNames)`` tuple of lists. Every list item is a
        ``(name, maleCount, femaleCount)`` tuple. A name whose two counts
        are equal is placed in neither list.
    """
    namesDict = extractNamesDict()
    maleNames = []
    femaleNames = []
    # Iterate key and value together so the counts always belong to the
    # original dictionary key. Looking the counts up again *after* stripping
    # accents would use a key that may no longer exist in namesDict (or that
    # belongs to a different, unaccented name).
    for rawName, counts in namesDict.items():
        # Progress output kept from the original script; a parenthesised
        # single argument prints the same under the Python 2 print statement.
        print(rawName)
        name = remove_accents(rawName)
        entry = (name, counts[0], counts[1])
        if counts[0] > counts[1]:
            maleNames.append(entry)
        elif counts[1] > counts[0]:
            femaleNames.append(entry)
    return (maleNames, femaleNames)
def extractNamesDict():
    """Sum the male and female counts per name over every file in names.zip.

    Each archive member is parsed as comma-separated rows whose first three
    columns are name, gender ('M' or 'F') and count.

    Returns:
        dict mapping the upper-cased name, exactly as read from the file
        (no decoding is applied here), to a two-item list
        ``[maleTotal, femaleTotal]``.

    Raises:
        KeyError: if a row's gender column is neither 'M' nor 'F'.
        ValueError: if a row's count column is not an integer.
    """
    names = {}
    genderMap = {'M': 0, 'F': 1}
    # The with-blocks close the archive and every member even when a row
    # fails to parse.
    with ZipFile('names.zip', 'r') as zf:
        for filename in zf.namelist():
            with zf.open(filename, 'r') as member:
                # NOTE: the binary member stream is handed straight to
                # csv.reader, which relies on Python 2's byte-oriented csv.
                for row in csv.reader(member, delimiter=','):
                    name = row[0].upper()
                    gender = genderMap[row[1]]
                    count = int(row[2])
                    # First sighting of a name starts both totals at zero.
                    totals = names.setdefault(name, [0, 0])
                    totals[gender] += count
    return names
# Build the name lists only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    getNameList()
答案 0 :(得分:3)
如果想把字符串中的Unicode字符稳健地转换为ASCII,可以使用非常出色的 unidecode 模块(第三方库,需要另行安装):
>>> import unidecode
>>> unidecode.unidecode(u'Björk')
'Bjork'
>>> unidecode.unidecode(u'András Sütő')
'Andras Suto'
>>> unidecode.unidecode(u'Ελλάς')
'Ellas'
答案 1 :(得分:2)
出现这个错误,是因为你在解码字节串时没有指定编解码器(此时Python 2会默认使用ASCII):
unicode(input_str)
请在该调用中指定编解码器(这里假设你的数据采用UTF-8编码,此时0xe1是一个3字节字符的首字节):
unicode(input_str, 'utf8')
答案 2 :(得分:2)
最佳做法是在数据进入程序时解码为Unicode:
for row in rows:
name=row[0].upper().decode('utf8') # or whatever...you DO need to know the encoding.
这样 remove_accents 就可以写成:
def remove_accents(input_str):
    """Return ``input_str`` with its combining marks (accents) removed.

    ``input_str`` is handed to ``unicodedata.normalize`` as-is, so it must
    already be decoded text. NFKD decomposition splits each accented
    character into a base character followed by combining marks, and the
    marks are then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_chars = []
    for char in decomposed:
        # combining() is 0 for anything that is not a combining mark.
        if unicodedata.combining(char) == 0:
            base_chars.append(char)
    return u''.join(base_chars)
在数据离开程序时(例如写入文件、数据库、终端等)再进行编码。
话说回来,你为什么要删除重音符号呢?