我是nltk训练分类器的新手,所以我正在尝试在电影评论语料库中训练NaiveBayesClassifier
,但是注意到它将负面特征集错误标记为正面,观察:
def bag_of_words(words):
return dict([(word, True) for word in words])
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
label_feats = collections.defaultdict(list)
for label in corp.categories():
for fileid in corp.fileids(categories=[label]):
feats = feature_detector(corp.words(fileids=[fileid]))
label_feats[label].append(feats)
return label_feats
def split_label_feats(lfeats, split=0.75):
train_feats = []
test_feats = []
for label, feats in lfeats.iteritems():
cutoff = int(len(feats) * split)
train_feats.extend([(feat, label) for feat in feats[:cutoff]])
test_feats.extend([(feat, label) for feat in feats[cutoff:]])
return train_feats, test_feats
>>> from nltk.corpus import movie_reviews
>>> from featx import label_feats_from_corpus, split_label_feats
>>> movie_reviews.categories()
['neg', 'pos']
>>> lfeats = label_feats_from_corpus(movie_reviews)
>>> lfeats.keys()
['neg', 'pos']
>>> train_feats, test_feats = split_label_feats(lfeats)
>>> len(train_feats)
750
>>> len(test_feats)
250
>>> from nltk.classify import NaiveBayesClassifier
>>> nb_classifier = NaiveBayesClassifier.train(train_feats)
>>> nb_classifier
<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x7f1127b50510>
>>> nb_classifier.labels()
['pos']
>>> from featx import bag_of_words
>>> negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
>>> nb_classifier.classify(negfeat)
'pos'
>>> posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
>>> nb_classifier.classify(posfeat)
'pos'
Why does the 'neg' label not show up when I call the labels function, and it labels the positive feature set as 'pos', so how can I change my code so that it labels the negative feature set as 'neg'?