我参考 http://www.nltk.org/book/ch06.html 来构建电影评论分类器。该教程中的分类器把各种词性(名词、形容词、动词……)的单词都作为特征集的一部分。我正在尝试建立一个只考虑动词的分类器,并评估它能否判断一条电影评论是正面的还是负面的。
请说明这种方法是否更好:如果是,应如何进一步改进;如果不是,还需要加入哪些其他词性标签(POS tag)来改进特征集。
请参阅以下代码
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import nltk
movie_reviews.categories()
# ['pos','neg']

# RegexpTokenizer(r'\w+') keeps only runs of word characters, so tokens made
# purely of punctuation are dropped.
tokenizer = RegexpTokenizer(r'\w+')

# Build the stopword set once instead of once per review. The previous code
# called stopwords.words() with no argument, which loads the stopword lists of
# every language in the corpus and therefore also filtered out valid English
# words.
english_stopwords = set(stopwords.words('english'))

# One (tokens, category) pair per review. Each review is reduced to its set of
# distinct non-stopword words before being re-tokenized.
# fileids(category) restricts the files to the category being iterated; the
# previous fileids() (no argument) paired every review with both 'pos' and
# 'neg', so the labels carried no information.
documents = [
    (
        tokenizer.tokenize(
            ' '.join(set(movie_reviews.words(fileid)) - english_stopwords)
        ),
        category,
    )
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
import random
random.shuffle(documents)

# Each document holds a review's words with stopwords and punctuation removed.
# For the first five documents, show the five most common lower-cased words
# together with the document's label.
for words, label in documents[:5]:
    counts = nltk.FreqDist(word.lower() for word in words)
    print(counts.most_common(5), label)
# Sample output for 5 documents:
#[('vampires', 1), ('clever', 1), ('interesting', 1), ('sunlight', 1), ('partners', 1)] neg
#[('family', 1), ('nino', 1), ('friends', 1), ('acting', 1), ('higher', 1)] pos
#[('inconsistent', 1), ('eye', 1), ('yes', 1), ('interesting', 1), ('praise', 1)] neg
#[('acting', 1), ('science', 1), ('bucks', 1), ('huge', 1), ('terrific', 1)] pos
#[('acting', 1), ('shielded', 1), ('somewhere', 1), ('think', 1), ('touched', 1)] neg
# Vocabulary: every distinct corpus word that is not a stopword, re-tokenized
# so that punctuation-only tokens are dropped.
# stopwords.words('english') replaces the previous stopwords.words(), which
# loads the stopword lists of every language and so removed valid English
# words (including verbs) from the vocabulary.
all_words = tokenizer.tokenize(
    ' '.join(set(movie_reviews.words()) - set(stopwords.words('english')))
)
freqdist = nltk.FreqDist(all_words)

# Keep every vocabulary word that the tagger labels with one of the verb tags.
# NOTE(review): pos_tag is fed the vocabulary as an unordered bag of unique
# words rather than real sentences, so the tags are assigned without genuine
# context and may be unreliable -- consider tagging movie_reviews.sents()
# and collecting the verbs from there.
verb_tags = {'VB', 'VBG', 'VBN', 'VBZ', 'VBD', 'VBP'}
pos_ = nltk.pos_tag(all_words)
verb = [word for word, tag in pos_ if tag in verb_tags]
# Document -> feature set: one boolean "contains(<verb>)" entry per verb.
def documentFeature(document, verbs=None):
    """Return verb-presence features for one document.

    Args:
        document: Iterable of word tokens (hashable, e.g. a list of strings)
            belonging to a single review.
        verbs: Optional iterable of verbs to test for. When omitted, the
            module-level ``verb`` list is used, as before.

    Returns:
        A dict mapping ``'contains(<verb>)'`` to ``True`` if that verb occurs
        in ``document`` and ``False`` otherwise.
    """
    if verbs is None:
        verbs = verb
    # Build the lookup set once: testing each verb against the raw token list
    # would rescan the whole document for every verb.
    present = set(document)
    return {'contains({0})'.format(v): (v in present) for v in verbs}
# Train a Naive Bayes classifier on the verb-presence features.
featureSet = []
for words, label in documents:
    featureSet.append((documentFeature(words), label))

# The first 100 (shuffled) examples are held out for testing; the rest train.
testSet = featureSet[:100]
trainSet = featureSet[100:]

classifier = nltk.NaiveBayesClassifier.train(trainSet)
print(nltk.classify.accuracy(classifier, testSet))
# Accuracy reported by the author on the test set: 0.03 (very poor).
目前我在测试集上的准确率只有 0.03,请帮助我提高准确率。