MaxEnt分类器每次返回相同的概率 - NLTK

时间:2015-12-04 12:40:16

标签: python nltk sentiment-analysis

我正在尝试使用NLTK库的maxent分类器。我有一个正面和负面的单词列表,我已经训练了分类器。问题是当我针对一个句子测试分类器时,我总是得到两个类的相同分类概率。这是代码 -

import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from sklearn import cross_validation
nltk.data.path.append("/home/daksh/Documents/Softwares/nltk_data")
import csv
import operator

from nltk.classify import MaxentClassifier
from nltk.corpus import movie_reviews


def getBestWords(posWords,negWords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1

    for word in negWords:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score


    sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1),reverse=True)[:2500]
    bestwords = set([w for w,s in sorted_x])

    return bestwords

def best_word_feats(words,bestwords):
    return dict([(word, True) for word in words if word in bestwords])

def word_feats(words):
    return dict([(word, True) for word in words])

def best_bigram_word_feats(words,posWords,negWords, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    bestwords = getBestWords(posWords,negWords)
    d.update(best_word_feats(words,bestwords))
    return d

posWords = list()
negWords = list()
with open('../data/finalSentiPosWords.csv','r') as csvfile:
    spamreader = csv.reader(csvfile)
    posWords = list(spamreader)

with open('../data/finalSentiNegWords.csv','r') as csvfile:
    spamreader = csv.reader(csvfile)
    negWords = list(spamreader)

posWords = [word[0] for word in posWords]
negWords = [word[0] for word in negWords]

bestwords = getBestWords(posWords,negWords)

posfeats = [(best_bigram_word_feats(posWords,posWords,negWords),'pos')]
negfeats = [(best_bigram_word_feats(negWords,posWords,negWords),'neg')]


trainfeats = negfeats + posfeats

algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[0]
classifier = nltk.MaxentClassifier.train(trainfeats, algorithm,max_iter=5)
# classifier = nltk.NaiveBayesClassifier.train(trainfeats)
classifier.show_most_informative_features(10)
sentence = "Dosa had a tangy taste but it was fun eating it. On the other hand the other dosa was soggy"
l = sentence.split(' ')
print(l)
print(word_feats(l))
print(classifier.prob_classify(word_feats(l)).prob('pos'))
print(classifier.prob_classify(word_feats(l)).prob('neg'))

这是 -

的输出
0.500074231063
0.499925768937

整体分类似乎运作良好但我无法弄清楚概率是如何计算的,为什么即使我改变了测试句也总是相同。

任何快速帮助表示赞赏。

感谢。

1 个答案:

答案 0 :(得分:0)

这是很多代码!我不会为你调试它,但我注意到<SCRIPT> $("#accordion > li").click(function(){ if(false == $(this).next().is(':visible')) { $('#accordion > ul').slideUp(300); } $(this).next().slideToggle(300); }); $('#accordion > ul:eq(0)').show(); </SCRIPT>` 是训练语料库中所有单词的集合。如果这不是完全错误的话,那肯定是误导性的。