I collected some tweets with an API and I need to do text processing on them. Many of the tweets contain emoticons written as plain text, e.g. :D, :P and so on.
Here is the code that replaces the slang/emoticon expressions with words in Python:
"""
exp_replace_copy.py
"""
import nltk
import re
# dictionary for sentiment analysis
emo_repl = {
    # good emotions
    "<3" : " good ",
    ":d" : " good ",
    ":D" : " good ",
    ":p" : " good ",
    "8)" : " good ",
    ":-)" : " good ",
    ":)" : " good ",
    ";)" : " good ",
    "(-:" : " good ",
    "(:" : " good ",
    "yay!" : " good ",
    "yay" : " good ",
    "yaay" : " good ",
    "yaaay" : " good ",
    "yaaaay" : " good ",
    "yaaaaay" : " good ",
    # bad emotions
    ":/" : " bad ",
    ":>" : " sad ",
    ":')" : " sad ",
    ":-(" : " bad ",
    ":(" : " bad ",
    ":s" : " bad ",
    ":-s" : " bad "
}
# dictionary for general use (i.e. the topic modeler)
emo_repl2 = {
    # good emotions
    "<3" : " heart ",
    ":d" : " smile ",
    ":p" : " smile ",
    ":D" : " smile ",
    "8)" : " smile ",
    ":-)" : " smile ",
    ":)" : " smile ",
    ";)" : " smile ",
    "(-:" : " smile ",
    "(:" : " smile ",
    # bad emotions
    ":/" : " worry ",
    ":>" : " angry ",
    ":')" : " sad ",
    ":-(" : " sad ",
    ":(" : " sad ",
    ":s" : " sad ",
    ":-s" : " sad "
}
# general word-level replacements (regular expressions)
re_repl = {
    r"\br\b" : "are",
    r"\bu\b" : "you",
    r"\bhaha\b" : "ha",
    r"\bhahaha\b" : "ha",
    r"\bdon't\b" : "do not",
    r"\bdoesn't\b" : "does not",
    r"\bdidn't\b" : "did not",
    r"\bhasn't\b" : "has not",
    r"\bhaven't\b" : "have not",
    r"\bhadn't\b" : "had not",
    r"\bwon't\b" : "will not",
    r"\bwouldn't\b" : "would not",
    r"\bcan't\b" : "can not",
    r"\bcannot\b" : "can not"
}
# replace longer emoticons first, so that e.g. ":-)" is matched before ":)"
emo_repl_order = [k for (k_len, k) in reversed(sorted([(len(k), k) for k in emo_repl.keys()]))]
emo_repl_order2 = [k for (k_len, k) in reversed(sorted([(len(k), k) for k in emo_repl2.keys()]))]
def replace_emo(sentence):
    """Replace emoticons/slang with sentiment words (for sentiment analysis)."""
    sentence2 = sentence
    for k in emo_repl_order:
        sentence2 = sentence2.replace(k, emo_repl[k])
    for r, repl in re_repl.iteritems():
        sentence2 = re.sub(r, repl, sentence2)
    return sentence2

def replace_reg(sentence):
    """Replace emoticons/slang with neutral words (for topic modeling)."""
    sentence2 = sentence
    for k in emo_repl_order2:
        sentence2 = sentence2.replace(k, emo_repl2[k])
    for r, repl in re_repl.iteritems():
        sentence2 = re.sub(r, repl, sentence2)
    return sentence2
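For reference, here is roughly what the two functions return for a made-up tweet (the sample strings are not from my data; the extra spaces come from the padded replacement words):

import exp_replace_copy

print exp_replace_copy.replace_emo("u r awesome :D <3")
# -> "you are awesome  good   good "
print exp_replace_copy.replace_reg("don't worry :(")
# -> "do not worry  sad "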
I use this script for topic modeling:
from gensim import corpora, models, similarities
import numpy as np
import nltk
import exp_replace_copy
from nltk.corpus import stopwords
class topic(object):
    def __init__(self, nbtopic=100, alpha=1, model=None, dicttp=None):
        self.nbtopic = nbtopic
        self.porter = nltk.PorterStemmer()
        self.alpha = alpha
        self.stop = stopwords.words('english') + ['.','!','?','"','...','\\',"''",'[',']','~',"'m","'s",';',':','..','$']
        # optionally load a previously trained model and dictionary
        if model is not None and dicttp is not None:
            self.lda = models.ldamodel.LdaModel.load(model)
            self.dictionary = corpora.Dictionary.load(dicttp)
    def fit(self, documents):
        # normalize emoticons/slang, tokenize, lowercase, drop stopwords and stem
        documents_mod = [exp_replace_copy.replace_reg(sentence) for sentence in documents]
        tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
        tokens = [[self.porter.stem(t.lower()) for t in sentence if t.lower() not in self.stop] for sentence in tokens]
        # build the gensim dictionary and bag-of-words corpus, then train LDA
        self.dictionary = corpora.Dictionary(tokens)
        corpus = [self.dictionary.doc2bow(text) for text in tokens]
        self.lda = models.ldamodel.LdaModel(corpus, id2word=self.dictionary, num_topics=self.nbtopic, alpha=self.alpha)
        self.lda.save('topics.tp')
        self.dictionary.save('topics_dict.tp')
    def get_topic(self, topic_number):
        return self.lda.print_topic(topic_number)
    def transform(self, sentence):
        # same preprocessing as fit(), then return the topic distribution
        sentence_mod = exp_replace_copy.replace_reg(sentence)
        tokens = nltk.word_tokenize(sentence_mod)
        tokens = [self.porter.stem(t.lower()) for t in tokens if t.lower() not in self.stop]
        corpus_sentence = self.dictionary.doc2bow(tokens)
        return self.lda[corpus_sentence]
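For context, traintest_copy.py calls the class roughly like this (a simplified sketch, not the exact file; the toy tweets and small parameter values are just placeholders):

import numpy as np
from topic_copy import topic

# toy stand-ins for pos_data / neg_data, which are really lists of tweet strings
pos_data = ["I love this :D", "yay great day <3"]
neg_data = ["this is awful :(", "do not even ask :/"]

topic_mod = topic(nbtopic=2, alpha='symmetric')
topic_mod.fit(np.concatenate((pos_data, neg_data)))  # this is the call that fails on my real data
print topic_mod.transform("a new tweet :)")          # list of (topic_id, weight) pairs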
When I run this script I get the following error:
Traceback (most recent call last):
  File "traintest_copy.py", line 27, in <module>
    topic_mod.fit(np.concatenate((pos_data,neg_data)))
  File "/home/shadow/Documents/Sarcasm_Detectorr/topic_copy.py", line 24, in fit
    tokens = [nltk.word_tokenize(sentence) for sentence in documents_mod]
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 106, in word_tokenize
    return [token for sent in sent_tokenize(text, language)
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.py", line 91, in sent_tokenize
    return tokenizer.tokenize(text)
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1226, in tokenize
    return list(self.sentences_from_text(text, realign_boundaries))
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1274, in sentences_from_text
    return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1265, in span_tokenize
    return [(sl.start, sl.stop) for sl in slices]
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1304, in _realign_boundaries
    for sl1, sl2 in _pair_iter(slices):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 311, in _pair_iter
    for el in it:
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1280, in _slices_from_text
    if self.text_contains_sentbreak(context):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1325, in text_contains_sentbreak
    for t in self._annotate_tokens(self._tokenize_words(text)):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 1460, in _annotate_second_pass
    for t1, t2 in _pair_iter(tokens):
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 310, in _pair_iter
    prev = next(it)
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 577, in _annotate_first_pass
    for aug_tok in tokens:
  File "/usr/local/lib/python2.7/dist-packages/nltk/tokenize/punkt.py", line 542, in _tokenize_words
    for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128)
Is there a problem with the dictionary? I can't figure it out. I'm using Python 2.7.
Edit: the code for the JSON-to-CSV conversion
import json
import csv
x = open('myjsonfile.json')
data = json.load(x)
x.close()
f = csv.writer(open('mycsvfile.csv', 'wb+'))
for item in data:
    # keep only the tweet text, strip newlines and write it out UTF-8 encoded
    f.writerow([item["text"].replace('\n', ' ').encode('utf-8')])
New edit: I wrote a workaround that strips all non-ASCII characters from the old CSV file:
import csv
x = csv.reader(open('unamusing.csv','rb+'))
f = csv.writer(open('unamusing1.csv', 'wb+'))
out_txt=[]
for item in x:
    # drop every character whose code point is >= 128
    out_txt.append(["".join(a if ord(a) < 128 else '' for a in i) for i in item])
f.writerows(out_txt)
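As a quick sanity check on the workaround, the new file can be scanned to confirm that only ASCII bytes are left (same file name as above):

import csv
# every character in every field of the cleaned file should now be plain ASCII
for row in csv.reader(open('unamusing1.csv', 'rb')):
    for field in row:
        assert all(ord(c) < 128 for c in field)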