I ran into the same problem as the one described here, but the solution given there doesn't seem to work for me. I'm not sure whether anyone can help. Thanks.
from SentimentAnalyzer import TweetTokenizer
from SentimentAnalyzer import DataSet
import json
import re
import collections
import nltk.metrics
import nltk.classify
import pickle
tweetsTokenizer = TweetTokenizer()
featureList = []
tweets = []
dataset = DataSet()
train_data = dataset.getTrainData()
test_data = dataset.getTestData()
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features
trainsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
NBClassifier = None
train = True
if train:
    # ... some preprocessing code above ...
    # Generate the training set
    print 'Extracting features...'
    training_set = nltk.classify.util.apply_features(extract_features, tweets)

    # Train the Naive Bayes classifier
    print 'Training dataset...'
    NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

    print 'Saving model...'
    f = open('NaiveBayesClassifier.pickle', 'wb')
    pickle.dump(NBClassifier, f)
    f.close()
else:
    f = open('NaiveBayesClassifier.pickle', 'rb')
    NBClassifier = pickle.load(f)
    f.close()
# Test the classifier
print 'Testing the model...'
for i, line in enumerate(test_data):
    tweetJson = json.loads(line)
    labelledSentiment = dataset.getTestSentiment(tweetJson['id_str']).encode('utf-8')
    trainsets[labelledSentiment].add(i)
    testTweet = tweetJson['text'].encode('utf-8')
    processedTestTweet = tweetsTokenizer.preprocess(testTweet)
    sentiment = NBClassifier.classify(extract_features(tweetsTokenizer.getFeatureVector(processedTestTweet)))
    testsets[sentiment].add(i)
    print "testTweet = %s, classified sentiment = %s, labelled sentiment = %s\n" % (testTweet, sentiment, labelledSentiment)
# print "testTweet = %s, classified sentiment = %s, labelled sentiment = %s\n" % (testTweet, sentiment, labelledSentiment)
print 'Positive precision:', nltk.metrics.precision(trainsets['positive'], testsets['positive'])
print 'Positive recall:', nltk.metrics.recall(trainsets['positive'], testsets['positive'])
print 'Positive F-measure:', nltk.metrics.f_measure(trainsets['positive'], testsets['positive'])
print 'Negative precision:', nltk.metrics.precision(trainsets['negative'], testsets['negative'])
print 'Negative recall:', nltk.metrics.recall(trainsets['negative'], testsets['negative'])
print 'Negative F-measure:', nltk.metrics.f_measure(trainsets['negative'], testsets['negative'])
print 'Neutral precision:', nltk.metrics.precision(trainsets['neutral'], testsets['neutral'])
print 'Neutral recall:', nltk.metrics.recall(trainsets['neutral'], testsets['neutral'])
print 'Neutral F-measure:', nltk.metrics.f_measure(trainsets['neutral'], testsets['neutral'])
print 'done'
The classifier that is trained and tested in the same run gives different results from the classifier that is loaded from the pickle without retraining. I can't figure out why. Thanks.
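One thing I noticed while writing this up: extract_features depends on the module-level featureList, and in my script that list is only filled by the preprocessing step inside the if train: branch, so it may still be empty when I only load the pickled classifier. I'm not sure this is the actual cause, but here is a minimal sketch of what I mean by persisting the feature list alongside the model (FeatureList.pickle is just a name I made up for illustration):

import pickle

# Sketch only: assumes featureList gets filled by the preprocessing step
# when train is True; 'FeatureList.pickle' is a hypothetical file name.
train = True
featureList = []

if train:
    # ... preprocessing fills featureList (and tweets) here ...
    f = open('FeatureList.pickle', 'wb')
    pickle.dump(featureList, f)    # save the vocabulary next to the classifier
    f.close()
else:
    f = open('FeatureList.pickle', 'rb')
    featureList = pickle.load(f)   # restore the same vocabulary before classifying
    f.close()

If featureList really were empty in the load-only run, extract_features would return an empty feature dict for every test tweet, which could explain why the two runs classify differently, but I haven't confirmed that this is what happens in my case.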