Question

我是nltk训练分类器的新手，所以我正在尝试在电影评论语料库中训练NaiveBayesClassifier，但是注意到它将负面特征集错误标记为正面，观察：

 def bag_of_words(words):
   """Return a presence-only feature dict mapping each word in ``words`` to True.

   Repeated words collapse into a single key.
   """
   return dict.fromkeys(words, True)

 def label_feats_from_corpus(corp, feature_detector=bag_of_words):
   """Group the feature sets of every file in ``corp`` by category label.

   ``corp`` must provide ``categories()``, ``fileids(categories=...)`` and
   ``words(fileids=...)``. ``feature_detector`` is called with the words of
   one file and returns that file's feature set.

   Returns a ``collections.defaultdict(list)`` mapping each label to the
   list of feature sets built from the files in that category.
   """
   label_feats = collections.defaultdict(list)
   for label in corp.categories():
     for fileid in corp.fileids(categories=[label]):
       # Both statements belong inside the per-file loop: one feature set
       # is built and stored for every file of the current label.
       feats = feature_detector(corp.words(fileids=[fileid]))
       label_feats[label].append(feats)
   return label_feats

 def split_label_feats(lfeats, split=0.75):
   """Split per-label feature sets into labeled training and test lists.

   ``lfeats`` maps each label to a sequence of feature sets. For every label
   the first ``int(len(feats) * split)`` feature sets go to the training
   list and the remainder to the test list, each paired with its label as a
   ``(feat, label)`` tuple.

   Returns a ``(train_feats, test_feats)`` tuple of lists.
   """
   train_feats = []
   test_feats = []
   # items() works on Python 2 and 3; iteritems() exists only on Python 2.
   for label, feats in lfeats.items():
     cutoff = int(len(feats) * split)
     # Both extends must stay inside the loop so that every label, not just
     # the last one iterated, contributes to the two lists.
     train_feats.extend([(feat, label) for feat in feats[:cutoff]])
     test_feats.extend([(feat, label) for feat in feats[cutoff:]])
   return train_feats, test_feats

 >>> from nltk.corpus import movie_reviews
 >>> from featx import label_feats_from_corpus, split_label_feats
 >>> movie_reviews.categories()
 ['neg', 'pos']
 >>> lfeats = label_feats_from_corpus(movie_reviews)
 >>> lfeats.keys()
 ['neg', 'pos']
 >>> train_feats, test_feats = split_label_feats(lfeats)
 >>> len(train_feats)
 750
 >>> len(test_feats)
 250
 >>> from nltk.classify import NaiveBayesClassifier
 >>> nb_classifier = NaiveBayesClassifier.train(train_feats)
 >>> nb_classifier
 <nltk.classify.naivebayes.NaiveBayesClassifier object at 0x7f1127b50510>
 >>> nb_classifier.labels()
 ['pos']
 >>> from featx import bag_of_words
 >>> negfeat = bag_of_words(['the', 'plot', 'was', 'ludicrous'])
 >>> nb_classifier.classify(negfeat)
 'pos'
 >>> posfeat = bag_of_words(['kate', 'winslet', 'is', 'accessible'])
 >>> nb_classifier.classify(posfeat)
 'pos'

  Why does the 'neg' label not show up when I call the labels() function, and why is the negative feature set classified as 'pos'? How can I change my code so that the negative feature set is labeled 'neg'?

如何更准确地标记特征集？

0 个答案: