我正在尝试针对一组公寓评论编写情感分析算法,并且我希望能够对某些类别(例如“位置”、“社交性”、“清洁度”等)的正面和负面评分进行分类。我的基本模型是一个使用词袋特征提取方法的朴素贝叶斯分类器。我真的很难找到一种方法来为我上面提到的类别添加特征提取,无论是通过调整我当前的模型还是使用不同的方法。
谁能帮我解决这个问题?我的基本模型如下所示以供参考。
# Preprocessing
# Gather, per sentiment label, whatever ``movie_reviews.words`` returns for
# each file id listed under that label (one entry per review file).
pos_reviews = [
    movie_reviews.words(review_id)
    for review_id in movie_reviews.fileids('pos')
]
neg_reviews = [
    movie_reviews.words(review_id)
    for review_id in movie_reviews.fileids('neg')
]
# Bag of words
def bag_of_words(words):
    """Turn an iterable of tokens into a bag-of-words feature dict.

    Every token is lower-cased. A token is kept only if it is not in the
    list returned by ``stopwords.words("english")`` and is not ``in``
    ``punctuation``.

    Args:
        words: Iterable of string tokens for a single document.

    Returns:
        A dict mapping each kept, lower-cased token to ``True``. Duplicate
        tokens collapse into a single key.
    """
    # Fetch the stop-word list once per call rather than once per token, and
    # keep it in a set so each membership test is O(1) instead of a list scan.
    stop_words = set(stopwords.words("english"))
    # NOTE(review): `punctuation` is defined outside this block (presumably
    # `string.punctuation`). If it is a str, `in` is a substring test, so
    # multi-character tokens such as "./" are dropped as well -- the original
    # check is kept unchanged on purpose.
    return {
        token: True
        for token in (word.lower() for word in words)
        if token not in stop_words and token not in punctuation
    }
# Pair each review's bag-of-words features with its sentiment label, giving
# one (features, label) tuple per review.
pos_reviews_set = [(bag_of_words(tokens), 'pos') for tokens in pos_reviews]
neg_reviews_set = [(bag_of_words(tokens), 'neg') for tokens in neg_reviews]
# TODO: add feature extractors for different categories
# Get train and test sets
# Shuffle each class in place (positives first, then negatives), then hold
# out the first 200 examples of each class for testing and train on the rest.
test_set = []
train_set = []
for labelled_reviews in (pos_reviews_set, neg_reviews_set):
    shuffle(labelled_reviews)
    test_set.extend(labelled_reviews[:200])
    train_set.extend(labelled_reviews[200:])

# Set up model: fit on the training split, score on the held-out split.
model = NaiveBayesClassifier.train(train_set)
accuracy = classify.accuracy(model, test_set)
print(f"accuracy: {accuracy}")