I am currently creating a custom corpus with NLTK to do sentiment analysis on Twitter messages.
My corpus consists of positive and negative tweets. I gave the relevant folder the same structure as the original 'movie_reviews' folder: it is called my_movie_reviews25K, with subfolders pos & neg, each containing 25K text files holding one pos or neg tweet apiece.
Now, when I build and evaluate this custom corpus, it works perfectly with the following code:
# This code builds a corpus from my own pos/neg tweets and evaluates a
# Naive Bayes classifier on single-word features.
import collections

import nltk.classify.util
import nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

root_folder = r'C:\Users\gerbuiker\Desktop\my_movie_reviews25K'
movie_reviews = CategorizedPlaintextCorpusReader(root_folder, r'.*\.txt', cat_pattern=r'(\w+)')
movie_reviews.categories()

# Define the split of % training / % test.
SPLIT = 0.8

def word_feats(words):
    return dict([(word, True) for word in words])

posids = movie_reviews.fileids('pos')
negids = movie_reviews.fileids('neg')

negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

cutoff = int(len(posfeats) * SPLIT)
trainfeats = negfeats[:cutoff] + posfeats[:cutoff]
testfeats = negfeats[cutoff:] + posfeats[cutoff:]
print 'Train on %d instances\nTest on %d instances' % (len(trainfeats), len(testfeats))

classifier = NaiveBayesClassifier.train(trainfeats)
print 'Accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()

# Collect reference (gold) and observed (predicted) index sets per label,
# so precision/recall/F-measure can be computed per class.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
With the output:
Train on 40000 instances
Test on 10000 instances
Accuracy: 0.7449
Most Informative Features
            followfriday = True              pos : neg    =    161.0 : 1.0
                  bummed = True              neg : pos    =     27.7 : 1.0
                  female = True              neg : pos    =     22.2 : 1.0
                   hurts = True              neg : pos    =     20.5 : 1.0
                anywhere = True              neg : pos    =     19.7 : 1.0
                 snowing = True              neg : pos    =     19.0 : 1.0
                      ff = True              pos : neg    =     18.1 : 1.0
                  throat = True              neg : pos    =     17.2 : 1.0
                 hurting = True              neg : pos    =     17.0 : 1.0
                   essay = True              neg : pos    =     16.6 : 1.0
pos precision: 0.831393775372
pos recall: 0.6144
pos F-measure: 0.706612995975
neg precision: 0.694210943695
neg recall: 0.8754
neg F-measure: 0.77434763379
To improve accuracy, I want to include bigrams. I use the following code:
# This code builds a corpus from my own pos/neg tweets. Includes bigrams.
import collections

import nltk.classify.util
import nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

root_folder = r'C:\Users\gerbuiker\Desktop\Sentiment Analyse\my_movie_reviews25K'
movie_reviews = CategorizedPlaintextCorpusReader(root_folder, r'.*\.txt', cat_pattern=r'(\w+)')
movie_reviews.categories()

def evaluate_classifier(featx):
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # 75% training / 25% test split (integer division under Python 2).
    negcutoff = len(negfeats) * 3 / 4
    poscutoff = len(posfeats) * 3 / 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    classifier.show_most_informative_features()

def word_feats(words):
    return dict([(word, True) for word in words])

print 'evaluating single word features'
evaluate_classifier(word_feats)

# Count word frequencies overall and per label, so each word's association
# with pos/neg can be scored with chi-square.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in movie_reviews.words(categories=['pos']):
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1

for word in movie_reviews.words(categories=['neg']):
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                           (freq, pos_word_count), total_word_count)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

# Keep only the 10,000 most informative words.
best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])

def best_word_feats(words):
    return dict([(word, True) for word in words if word in bestwords])

print 'evaluating best word features'
evaluate_classifier(best_word_feats)

def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words))
    return d

print 'evaluating best words + bigram chi_sq word features'
evaluate_classifier(best_bigram_word_feats)
But now I get the following error message:
C:\Users\gerbuiker\Anaconda\python.exe E:/bigrams.py
Traceback (most recent call last):
File "E:/bigrams.py", line 30, in <module>
negfeats = [(bigram_word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
File "E:/bigrams.py", line 24, in bigram_word_feats
bigrams = bigram_finder.nbest(score_fn, n)
File "C:\Users\gerbuiker\AppData\Roaming\Python\Python27\site-packages\nltk\collocations.py", line 112, in nbest
return [p for p, s in self.score_ngrams(score_fn)[:n]]
File "C:\Users\gerbuiker\AppData\Roaming\Python\Python27\site-packages\nltk\collocations.py", line 108, in score_ngrams
return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0]))
File "C:\Users\gerbuiker\AppData\Roaming\Python\Python27\site-packages\nltk\collocations.py", line 100, in _score_ngrams
score = self.score_ngram(score_fn, *tup)
File "C:\Users\gerbuiker\AppData\Roaming\Python\Python27\site-packages\nltk\collocations.py", line 169, in score_ngram
return score_fn(n_ii, (n_ix, n_xi), n_all)
File "C:\Users\gerbuiker\AppData\Roaming\Python\Python27\site-packages\nltk\metrics\association.py", line 220, in chi_sq
return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx)
File "C:\Users\gerbuiker\AppData\Roaming\Python\Python27\site-packages\nltk\metrics\association.py", line 212, in phi_sq
((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)))
ZeroDivisionError: float division by zero
Process finished with exit code 1
Can anyone help me out?
Most of the code comes from: http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
and the bigram case from: http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/
Answer 0: (score: 1)
I had a very similar situation, and here is what I found. It was not the script. It may be unique to my situation, but you might find it useful.
I split my text into sections to see whether any particular part was causing the problem, because my training data set worked fine while my test data set raised this error. Eventually I found a single line of text that triggered it: essentially, it was the exact same word repeated, e.g. "work work" or "hello hello".
Once I removed that line, the problem went away. Hope this helps.
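Building on that diagnosis, here is a minimal defensive sketch (my own addition, not part of the original answer; it reuses best_word_feats, BigramCollocationFinder and BigramAssocMeasures exactly as defined and imported in the question's code) that catches the ZeroDivisionError and falls back to plain word features for the offending text:

def safe_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # Start from plain word features; add bigrams only if scoring succeeds.
    d = best_word_feats(words)
    try:
        bigram_finder = BigramCollocationFinder.from_words(words)
        d.update(dict([(bigram, True) for bigram in bigram_finder.nbest(score_fn, n)]))
    except ZeroDivisionError:
        # A degenerate tweet such as "work work" can give chi_sq a zero
        # denominator inside nbest(); skip the bigrams for that text.
        pass
    return d

Passing safe_bigram_word_feats to evaluate_classifier should then get past the crash without hunting down the offending tweets by hand.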
Answer 1: (score: 0)
Hi, the problem here is that you did not update this line of code:
self.var.set(3)
Maybe you are trying to put a dictionary or a list where only one word at a time can go. Hope it helps.
Try this:
label_word_fd['pos'][word.lower()] += 1
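For context (my addition, stated as an assumption rather than a fact from this thread): the streamhacker tutorials were written against NLTK 2, whose FreqDist incremented counts with an inc() method; NLTK 3 removed it and made FreqDist a collections.Counter subclass, which is why the item-assignment form suggested above is the one that works:

from nltk.probability import FreqDist

word_fd = FreqDist()
for word in ['Hello', 'world', 'hello']:
    # NLTK 2.x style was word_fd.inc(word.lower()); under NLTK 3,
    # FreqDist is a Counter subclass, so increment by item assignment.
    word_fd[word.lower()] += 1
print word_fd['hello']  # prints 2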