我正在使用python与scikit-learn和nltk进行电影评论的情绪分析。我想将与unigram相关的元素等同于0(当它们具有相反极性时),当与那些单字组相关的二元组/三元组不为零时。
例如:
movie is not bad
其特征向量为 ['movie', 'is', 'not', 'bad', 'movie is', 'is not', 'not bad'] = [3 3 1 1 4 2 4]
但我想改为 [3 3 0 0 4 2 4]。
代码:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import math
####################### Reading Training Review Phrases and Sentiments ###################
# Each line of sentences.txt is "<phrase>\t<sentiment label>".
# Split each line once (the original split the same line twice, once per field).
train_list = []
train_sentiment = []
with open('sentences.txt') as f:
    for sentence in f:
        fields = sentence.rstrip('\n').split("\t")
        train_list.append(fields[0])
        train_sentiment.append(fields[1])
####################### Number of phrases in each class ###################################
# Load the phrases of each sentiment class and record the class sizes.
# (The original chained `ex_pos_phrases = pos_phrases = ... = []`, aliasing one
# shared list — harmless only because every name was immediately reassigned.)
def _read_phrases(path):
    """Return every line of *path* as a list (trailing newlines kept, as readlines does)."""
    with open(path, 'r') as fh:
        return fh.readlines()

ex_pos_phrases = _read_phrases('ex_pos.txt')
pos_phrases = _read_phrases('pos.txt')
neu_phrases = _read_phrases('neu.txt')
neg_phrases = _read_phrases('neg.txt')
ex_neg_phrases = _read_phrases('ex_neg.txt')
# Class sizes, used later as the denominators of the log-frequency weights.
ex_pos = len(ex_pos_phrases)
pos = len(pos_phrases)
neu = len(neu_phrases)
neg = len(neg_phrases)
ex_neg = len(ex_neg_phrases)
print("{},{},{},{},{}".format(ex_neg, neg, neu, pos, ex_pos))
####################### Getting unique Words ###################################
# Fit TF-IDF over the training phrases to obtain the vocabulary.
# BUG FIX: the original passed input=train_list, but TfidfVectorizer's `input`
# parameter expects the string 'content', 'filename' or 'file'; the list only
# "worked" because any other value falls back to treating documents as raw
# content. The documents belong in fit_transform(), where they already were.
model = TfidfVectorizer()
train_tfidf = model.fit_transform(train_list)
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); kept as-is for the version this script targets.
unique_words = model.get_feature_names()
print("##### Word sentiment matrix ####")
########################## Word sentiment matrix ########################################
word_sentiment = [[0 for x in range(5)] for x in range(len(unique_words)) ]
wordcount = 0
for word in unique_words:
count = 0
for review in ex_neg_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][0] = count
count = 0
for review in neg_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][1] = count
count = 0
for review in neu_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][2] = count
count = 0
for review in ex_pos_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][4] = count
count = 0
for review in pos_phrases:
review_words = review.rstrip('\n').split(" ")
for review_word in review_words:
if review_word == word:
count += 1
break
word_sentiment[wordcount][3] = count
wordcount += 1
print("###The Training feature matrix###")
#################################The feature matrix#######################################
feature_matrix = [[0 for x in range(len(unique_words))] for x in range(len(train_list))]
print(len(feature_matrix))
print(len(feature_matrix[0]))
wordcount = 0
for unique_word in unique_words:
phrasecount = 0
ep = p = nu = en = n = 0
if word_sentiment[wordcount][4] != 0:
ep = .35 * math.log(word_sentiment[wordcount][4]/ex_pos)
if word_sentiment[wordcount][3] != 0:
p = .15 * math.log(word_sentiment[wordcount][3]/pos)
if word_sentiment[wordcount][2] != 0:
nu = 1 * math.log(word_sentiment[wordcount][2]/neu)
if word_sentiment[wordcount][0] != 0:
en = -.35 * math.log(word_sentiment[wordcount][0]/ex_neg)
if word_sentiment[wordcount][1] != 0:
n = -.15 * math.log(word_sentiment[wordcount][1]/neg)
for phrase in train_list:
words = phrase.split(" ")
docwordcount = 0
for word in words:
if word == unique_word:
docwordcount += 1
tfidf = (docwordcount * ep) + (docwordcount * p) + (docwordcount * nu) + (docwordcount * en) + (docwordcount * n)
feature_matrix[phrasecount][wordcount] = tfidf
phrasecount += 1
wordcount += 1
print("###The test feature matrix###")
test_list=[]
test_phraseid =[]
with open('sentences_test.txt') as f:
content = f.readlines()
for sentence in content:
test_list.append(sentence.rstrip('\n').split("\t")[0])
test_phraseid.append(sentence.rstrip('\n').split("\t")[1])
# Test-set features built with exactly the same weighting scheme as the
# training matrix: per-word weight = sum over classes of
# coef * log(df / class_size) (class skipped when df == 0), feature value =
# token count in the phrase * weight. Weights are hoisted out of the phrase
# loop and each phrase is tokenized once (was O(vocab x phrases x len)).
_test_weights = []
for counts in word_sentiment:
    w = 0.0
    for df, class_size, coef in ((counts[4], ex_pos, .35),
                                 (counts[3], pos, .15),
                                 (counts[2], neu, 1),
                                 (counts[0], ex_neg, -.35),
                                 (counts[1], neg, -.15)):
        if df != 0:
            w += coef * math.log(df / class_size)
    _test_weights.append(w)

test_tfidf = []
for phrase in test_list:
    occurrences = {}
    for token in phrase.split(" "):
        occurrences[token] = occurrences.get(token, 0) + 1
    test_tfidf.append([occurrences.get(word, 0) * _test_weights[col]
                       for col, word in enumerate(unique_words)])
print("###The Linear SVC ###")
self = LinearSVC()
self = LinearSVC.fit(self, feature_matrix, train_sentiment)
test_sentiment = LinearSVC.predict(self, test_tfidf)
with open('output_deltatfidf.csv', 'w') as fil:
fil.write("PhraseId,Sentiment\n")
for x in range(0, len(test_sentiment)):
fil.write(test_phraseid[x] + "," + test_sentiment[x] + "\n")