我想用 Python 把 Unicode 转义序列转换成对应的拉丁字符。我有一个很大的文本文件,里面包含 Unicode 转义和其他各种内容。我只想替换其中 4 个 Unicode 字符,例如 \u00f6、\u015f 等。我只是想看到推文实际发布时的样子(即原始语言)。下面是实际收集推文并保存到文本文件中的代码。我添加了“#!/usr/bin/python”。
# Stream listener that appends each raw payload to 'turkeyjson28.txt' and the
# extracted tweet text to 'turkey_clean28.txt' (Python 2 code).
# NOTE(review): indentation was lost in this paste, and the stray `"""` at the
# end of the `good = ...` line opens a string that is never closed inside this
# block, so the code does not parse as shown.
class listener(StreamListener):
# Handle one payload `data`; the success path returns True.
def on_data(self,data):
try:
dirty = open('turkeyjson28.txt','a')
# Encodes with the 'ignore' error handler: characters that cannot be
# represented in ASCII are dropped from what is written to the raw file.
encode = data.encode('ascii','ignore')
dirty.write(encode)
# NOTE(review): `tweet` is only assigned further down, after this first use.
good = tweet.decode("utf-8") """
# NOTE(review): "utf=8" looks like a typo for "utf-8" -- confirm. `good` was
# already decoded on the previous line and is decoded again here.
better = good.decode("utf=8").replace(u"\u00f6", "ö")
print better
dirty.write('\n')
dirty.close()
# Take the text between ',"text":"' and '","source' in the raw payload.
tweet = data.split(',"text":"')[1].split('","source')[0]
#saveThis = str(time.time())+'::'+tweet
saveFile = open('turkey_clean28.txt','a')
saveFile.write(better)
saveFile.write('\n')
saveFile.write('\n')
saveFile.close()
return True
# Any exception is printed and followed by a 5 second sleep; there is no
# explicit return on this path. Neither file is closed if the failure
# happens before the corresponding close() call.
except BaseException, e:
print 'failed ondata,',str(e)
time.sleep(5)
# Print the status value passed in by the stream.
def on_error(self, status):
print status
# Build the OAuth handler from the consumer key/secret and attach the access
# token. ckey, csecret, atoken and asecret are not defined in this excerpt.
auth = OAuthHandler(ckey,csecret)
auth.set_access_token(atoken,asecret)
# Create a stream that hands its data to a `listener` instance and filter it
# on the keyword "turkey".
twitterStream = Stream(auth,listener())
twitterStream.filter(track = ["turkey"])
答案 0 :(得分:1)
import html
import unicodedata
def normalize(value, encoding=None):
    """Fold characters that *encoding* cannot represent down to ASCII.

    Characters that can be encoded with *encoding* are returned untouched.
    Every other character is NFKD-decomposed and reduced to its ASCII
    parts; characters with no ASCII decomposition are dropped. When
    *encoding* is None the whole string is folded to ASCII.

    Unlike a round trip through ``unicode-escape`` and ``html.unescape``,
    this leaves literal backslashes (``\\n``, ``\\x41``, a trailing ``\\``)
    and existing HTML entities (``&amp;``) in the input unchanged.

    Args:
        value: The text to normalize.
        encoding: Name of the target codec (for example ``'latin-1'``),
            or None to reduce everything to ASCII.

    Returns:
        The normalized text, which can be encoded with *encoding*
        (or with ASCII when *encoding* is None).

    Examples (は is not in latin-1 and has no decomposition, ő is not in
    latin-1 but decomposes to ``o``, ó is in latin-1, ``o`` is ASCII):

    >>> text = 'は | ő | ó | o'
    >>> normalize(text, 'latin-1')
    ' | o | ó | o'
    >>> normalize(text)
    ' | o | o | o'
    """
    def to_ascii(text):
        # NFKD splits accented letters into base letter + combining marks;
        # the ASCII encode with 'ignore' then keeps only the base letters.
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    if encoding is None:
        return to_ascii(value)

    # Fast path: nothing to fold when the whole string is already encodable.
    try:
        value.encode(encoding)
    except UnicodeEncodeError:
        pass
    else:
        return value

    def fold(char):
        # Keep a character the target codec can represent; fold the rest.
        try:
            char.encode(encoding)
        except UnicodeEncodeError:
            return to_ascii(char)
        return char

    return ''.join(fold(char) for char in value)
答案 1 :(得分:0)
better = good.decode("utf-8").replace(u"\u00f6", "ö")
更改为
better = good.decode("utf-8").replace(u"\u00f6", u"\u00f6".encode("utf8"))
或者,您需要在文件的开头加上以下两行
#!/usr/bin/python
# -*- coding: utf8 -*-
一般情况下,我会避免使用编码解决方案,只需将其替换为您想要的unicode字符
我经常会写一对辅助函数来协助完成这项工作
def decode(byte_str,encodings=["latin1","utf8","cp1252"]):
    """Decode *byte_str* with the first codec in *encodings* that accepts it.

    Python 2 helper. A ``unicode`` argument is first turned into bytes with
    ``encode`` (using the same codec list) and then decoded again.

    Returns the decoded text, or None when every codec raises
    UnicodeDecodeError.

    NOTE: "latin1" maps every possible byte, so with the default list the
    first codec always succeeds and the later ones are never tried.
    """
    if not isinstance(byte_str, str) and isinstance(byte_str, unicode):
        byte_str = encode(byte_str, encodings)
    for codec in encodings:
        try:
            text = byte_str.decode(codec)
        except UnicodeDecodeError:
            # This codec rejected the bytes; fall through to the next one.
            pass
        else:
            return text
    return None
def encode(unicode_txt,encodings=["latin1","utf8","cp1252"]):
    """Encode *unicode_txt* with the first codec in *encodings* that can.

    Python 2 helper. A byte ``str`` argument is first converted to
    ``unicode`` with ``decode`` (using the same codec list) and then
    encoded.

    Returns the encoded bytes, or None when no codec in *encodings* can
    represent the text.
    """
    if not isinstance(unicode_txt, unicode) and isinstance(unicode_txt, str):
        unicode_txt = decode(unicode_txt, encodings)
    for enc in encodings:
        try:
            return unicode_txt.encode(enc)
        except UnicodeError:
            # unicode.encode() signals an unrepresentable character with
            # UnicodeEncodeError. Catching only UnicodeDecodeError let that
            # error escape, so the fallback codecs were never tried.
            # UnicodeError is the parent of both.
            continue
    return None
#then you can just do something like
# Decode `good` (not defined in this snippet) and replace each U+00F6 with the
# result of passing u"\u00f6" through decode() with the utf8/latin1/ascii list.
decode(good).replace(u"\u00f6",decode(u"\u00f6",encodings=["utf8","latin1","ascii"]))