UnicodeDecodeError: working with text

Time: 2016-12-04 16:00:19

Tags: python python-2.7

I collected some tweets through an API and now need to do text processing on them. Many of the tweets contain emoticons written as text, such as :D, :P, and so on.

This is the Python code that replaces slang expressions and emoticons with words:

"""
exp_replace_copy.py
"""

import nltk
import re

#dictionary for sentiment analysis

emo_repl = {
#good emotions
"<3" : " good ",
":d" : " good ",
":D" : " good ",
":p" : " good ",
"8)" : " good ",
":-)" : " good ",
":)" : " good ",
";)" : " good ",
"(-:" : " good ",
"(:" : " good ",

"yay!" : " good ",
"yay" : " good ",
"yaay" : " good ",
"yaaay" : " good ",
"yaaaay" : " good ",
"yaaaaay" : " good ",    
#bad emotions
":/" : " bad ",
":>" : " sad ",
":')" : " sad ",
":-(" : " bad ",
":(" : " bad ",
":s" : " bad ",
":-s" : " bad "
}

#dictionary for general use (i.e. topic modeling)
emo_repl2 = {
#good emotions
"<3" : " heart ",
":d" : " smile ",
":p" : " smile ",
":D" : " smile ",
"8)" : " smile ",
":-)" : " smile ",
":)" : " smile ",
";)" : " smile ",
"(-:" : " smile ",
"(:" : " smile ",

#bad emotions
":/" : " worry ",
":>" : " angry ",
":')" : " sad ",
":-(" : " sad ",
":(" : " sad ",
":s" : " sad ",
":-s" : " sad "
}

#general
re_repl = {
r"\br\b" : "are",
r"\bu\b" : "you",
r"\bhaha\b" : "ha",
r"\bhahaha\b" : "ha",
r"\bdon't\b" : "do not",
r"\bdoesn't\b" : "does not",
r"\bdidn't\b" : "did not",
r"\bhasn't\b" : "has not",
r"\bhaven't\b" : "have not",
r"\bhadn't\b" : "had not",
r"\bwon't\b" : "will not",
r"\bwouldn't\b" : "would not",
r"\bcan't\b" : "can not",
r"\bcannot\b" : "can not"    
}

emo_repl_order = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl.keys()]))]
emo_repl_order2 = [k for (k_len,k) in reversed(sorted([(len(k),k) for k in emo_repl2.keys()]))]

def replace_emo(sentence):
    sentence2 = sentence
    for k in emo_repl_order:
        sentence2 = sentence2.replace(k,emo_repl[k])
    for r, repl in re_repl.iteritems():
        sentence2 = re.sub(r,repl,sentence2)
    return sentence2

def replace_reg(sentence):
    sentence2 = sentence
    for k in emo_repl_order2:
        sentence2 = sentence2.replace(k,emo_repl2[k])
    for r, repl in re_repl.iteritems():
        sentence2 = re.sub(r,repl,sentence2)
    return sentence2
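
A quick way to sanity-check the two functions (a minimal sketch, assuming the module above is saved as exp_replace_copy.py and run under Python 2.7):

#check_replacements.py - small sanity check, not part of the original pipeline
import exp_replace_copy

#emoticons and slang become sentiment words, contractions are expanded
print exp_replace_copy.replace_emo("yay :D u won't regret it")

#emoticons become topic words instead
print exp_replace_copy.replace_reg("don't worry :) haha")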

This is the script I use for topic modeling:

from gensim import corpora, models, similarities
import numpy as np
import nltk
import exp_replace_copy
from nltk.corpus import stopwords

class topic(object):

    def __init__(self,nbtopic=100,alpha=1,model=None,dicttp=None):

        self.nbtopic = nbtopic
        self.porter = nltk.PorterStemmer()
        self.alpha = alpha
        self.stop = stopwords.words('english')+['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
        if model is not None and dicttp is not None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary =  corpora.Dictionary.load(dicttp)

    def fit(self,documents):

        documents_mod = [exp_replace_copy.replace_reg(sentence) for sentence in documents]
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]        

        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus,id2word=self.dictionary, num_topics=self.nbtopic,alpha=self.alpha)

        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')

    def get_topic(self,topic_number):

        return self.lda.print_topic(topic_number)

    def transform(self,sentence):

        sentence_mod = exp_replace_copy.replace_reg(sentence)
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop] 
        corpus_sentence = self.dictionary.doc2bow(tokens)

        return self.lda[corpus_sentence]
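
For context, this is roughly how the class gets used (a hypothetical sketch; my actual traintest_copy.py is longer, and pos_data / neg_data are presumably lists of tweet strings loaded from the CSV files):

#usage sketch - hypothetical driver, not the actual traintest_copy.py
import numpy as np
from topic_copy import topic   #the class above, saved as topic_copy.py

pos_data = [u"I love this :D", u"yay, what a great day!"]     #placeholder tweets
neg_data = [u"this is so annoying :(", u"I don't even care"]  #placeholder tweets

topic_mod = topic(nbtopic=5, alpha='symmetric')      #small values just for the demo
topic_mod.fit(np.concatenate((pos_data, neg_data)))  #trains and saves the LDA model
print topic_mod.transform(u"what a wonderful day :)")#list of (topic_id, weight) pairs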

When I run this script I get the following error:

Traceback (most recent call last):
  File "traintest_copy.py", line 27, in <module>
    topic_mod.fit(np.concatenate((pos_data,neg_data)))
  File "/home/shadow/Documents/Sarcasm_Detectorr/topic_copy.py", line 24, in fit
    tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 106, in word_tokenize
    return [token for sent in sent_tokenize(text, language)
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 91, in sent_tokenize
    return tokenizer.tokenize(text)
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1226, in tokenize
    return list(self.sentences_from_text(text, realign_boundaries))
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1274, in sentences_from_text
    return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1265, in span_tokenize
    return [(sl.start, sl.stop) for sl in slices]
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1304, in _realign_boundaries
    for sl1, sl2 in _pair_iter(slices):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
    for el in it:
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1280, in _slices_from_text
    if self.text_contains_sentbreak(context):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1325, in text_contains_sentbreak
    for t in self._annotate_tokens(self._tokenize_words(text)):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1460, in _annotate_second_pass
    for t1, t2 in _pair_iter(tokens):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
    prev = next(it)
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
    for aug_tok in tokens:
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
    for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128)

Is the problem in the dictionaries? I can't figure it out. I am using Python 2.7.
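
From the last line of the traceback, it looks like the Punkt tokenizer is handed a plain byte string containing the byte 0xe2 (the first byte of UTF-8 sequences such as curly quotes or the ellipsis character), and Python 2 then tries to decode it with the default ASCII codec. One thing that should avoid this (a minimal sketch, assuming the tweets really are UTF-8-encoded byte strings, as written by the conversion code in the edit below) is decoding them to unicode before they reach nltk.word_tokenize:

#decode_first.py - decode the byte strings before tokenizing (Python 2.7)
#Assumes the tweets are UTF-8-encoded byte strings, as produced by the
#.encode('utf-8') call in the JSON-to-CSV conversion shown in the edit below.
import nltk

def to_unicode(text):
    #return a unicode object, decoding UTF-8 byte strings when necessary
    if isinstance(text, str):
        return text.decode('utf-8', 'replace')
    return text

raw = 'caf\xc3\xa9 \xe2\x80\x9cquoted\xe2\x80\x9d :D'  #UTF-8 bytes, contains 0xe2
print nltk.word_tokenize(to_unicode(raw))              #tokenizes without UnicodeDecodeError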

EDIT: the code for the JSON-to-CSV conversion

import json
import csv

x = open('myjsonfile.json')
data = json.load(x)
x.close()

f = csv.writer(open('mycsvfile.csv', 'wb+'))
for item in data:
    f.writerow([item["text"].replace('\n',' ').encode('utf-8')])

NEW EDIT: I wrote a workaround that removes all non-ASCII characters from the old CSV file

import csv

x = csv.reader(open('unamusing.csv','rb+'))
f = csv.writer(open('unamusing1.csv', 'wb+'))
out_txt=[]
for item in x:
    out_txt.append(["".join(a if ord(a) < 128 else '' for a in i) for i in item])

f.writerows(out_txt)
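
This works, but it also drops every legitimate non-ASCII character (accented letters, curly quotes, emoji). An alternative (a rough sketch, assuming mycsvfile.csv is the UTF-8 file written by the conversion code above) would be to keep the bytes and just decode each cell when reading it back:

#read_utf8_csv.py - read the CSV back as unicode instead of dropping non-ASCII characters
import csv

def read_tweets(path):
    #yield each tweet as a unicode object; assumes the file is UTF-8-encoded
    with open(path, 'rb') as fh:
        for row in csv.reader(fh):
            if row:
                yield row[0].decode('utf-8', 'replace')

tweets = list(read_tweets('mycsvfile.csv'))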

0 Answers

No answers yet.