嘿,我正在尝试使用朴素贝叶斯(Naive Bayes)分类器对一些文本进行分类,用的是 NLTK。每当我用 classify() 方法测试分类器时,它对第一个项目总是返回正确的分类,但对之后我分类的每一行文本都返回与第一个相同的分类。以下是我的代码:
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
import nltk
import random
import nltk.data
# Pair each review's token list with the category it is filed under.
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
# The comprehension above emits documents grouped by category; shuffle them
# in place so the slices taken later mix the categories.
random.shuffle(documents)
# Count every corpus token after lower-casing it.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# Keep the first 2000 keys as the feature vocabulary.
# NOTE(review): slicing keys() requires it to return a list, and whether the
# first 2000 keys are the most frequent words depends on the key order of the
# installed NLTK's FreqDist -- confirm against the version in use.
word_features = all_words.keys()[:2000]
def bag_of_words(words):
    """Return a feature dict that maps every token in `words` to True.

    Args:
        words: Iterable of hashable tokens (e.g. the output of a tokenizer).

    Returns:
        A dict of the form ``{token: True}``. Duplicate tokens collapse into
        a single key, and an empty iterable yields an empty dict.
    """
    # A dict comprehension avoids building a throwaway 2-item list per token.
    return {word: True for word in words}
def document_features(document):
    """Build the feature dict for one document.

    For every word in the module-level `word_features`, the result holds a
    key of the form 'contains(<word>)' whose value is True when that word
    occurs in `document` and False otherwise.
    """
    # Convert once to a set so each membership test below is a hash lookup.
    document_words = set(document)
    features = {}
    for word in word_features:
        # NOTE: keys here are 'contains(<word>)' strings, whereas
        # bag_of_words() keys its features by the bare word.
        features['contains(%s)' % word] = (word in document_words)
    return features
# Extract a feature dict for every (document, category) pair.
featuresets = [(document_features(d), c) for (d,c) in documents]
# The first 100 feature sets go to test_set (not used below); the classifier
# is trained on everything after them.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
text1="i love this city"
text2="i hate this city"
# These feature dicts are built with bag_of_words(), not document_features().
feats1=bag_of_words(word_tokenize(text1))
feats2=bag_of_words(word_tokenize(text2))
# Python 2 print statements: show the predicted label for each sentence.
print classifier.classify(feats1)
print classifier.classify(feats2)
这段代码会打印两次 pos;而如果我把代码的最后两行对调,它就会打印两次 neg。有人可以帮忙吗?
答案 0 :(得分:4)
将
features['contains(%s)' % word] = (word in document_words)
改为
features[word] = (word in document)
否则分类器只知道“contains(...)”形式的“单词”,因此对 "i love this city" 中的单词一无所知。完整的示例如下:
import nltk.tokenize as tokenize
import nltk
import random
# Seed the RNG so that random.shuffle(documents) below is repeatable.
random.seed(3)
def bag_of_words(words):
    """Return a feature dict that maps every token in `words` to True.

    Args:
        words: Iterable of hashable tokens (e.g. the output of a tokenizer).

    Returns:
        A dict of the form ``{token: True}``. Duplicate tokens collapse into
        a single key, and an empty iterable yields an empty dict.
    """
    # A dict comprehension avoids building a throwaway 2-item list per token.
    return {word: True for word in words}
def document_features(document):
    """Build the feature dict for one document.

    Maps every word in the module-level `word_features` to True when it is
    in `document` and False otherwise. The keys are the bare words, which is
    the same key format that bag_of_words() produces.
    """
    features = {}
    for word in word_features:
        features[word] = (word in document)
        # Previous key format, kept for comparison:
        # features['contains(%s)' % word] = (word in document_words)
    return features
movie_reviews = nltk.corpus.movie_reviews
# Store each document as a set of its tokens, paired with its category, so
# the `word in document` test in document_features() is a set lookup.
documents = [(set(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
# Count every corpus token after lower-casing it.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
# NOTE(review): slicing keys() requires it to return a list, and whether the
# first 2000 keys are the most frequent words depends on the key order of the
# installed NLTK's FreqDist -- confirm against the version in use.
word_features = all_words.keys()[:2000]
# Train on the first 200 shuffled documents only.
train_set = [(document_features(d), c) for (d, c) in documents[:200]]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features()
for word in ('love', 'hate'):
    # No hope in passing the tests if word is not in word_features
    assert word in word_features
    # Probability of the 'pos' label given a feature dict holding only `word`.
    print('probability {w!r} is positive: {p:.2%}'.format(
        w = word, p = classifier.prob_classify({word : True}).prob('pos')))
tests = ["i love this city",
         "i hate this city"]
for test in tests:
    # Tokenize the sentence and key the features by bare word.
    words = tokenize.word_tokenize(test)
    feats = bag_of_words(words)
    print('{s} => {c}'.format(s = test, c = classifier.classify(feats)))
输出:
Most Informative Features
worst = True neg : pos = 15.5 : 1.0
ridiculous = True neg : pos = 11.5 : 1.0
batman = True neg : pos = 7.6 : 1.0
drive = True neg : pos = 7.6 : 1.0
blame = True neg : pos = 7.6 : 1.0
terrible = True neg : pos = 6.9 : 1.0
rarely = True pos : neg = 6.4 : 1.0
cliches = True neg : pos = 6.0 : 1.0
$ = True pos : neg = 5.9 : 1.0
perfectly = True pos : neg = 5.5 : 1.0
probability 'love' is positive: 61.52%
probability 'hate' is positive: 36.71%
i love this city => pos
i hate this city => neg