我的代码有一些问题,当我运行它时显示一些错误。我正在使用python 所以,这是我的代码
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import SklearnClassifier
import csv
from sklearn import cross_validation
from sklearn.svm import LinearSVC, SVC
import random
from nltk.corpus import stopwords
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
posdata = []
with open('positive-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
posdata.append(val[0])
negdata = []
with open('negative-data.csv', 'rb') as myfile:
reader = csv.reader(myfile, delimiter=',')
for val in reader:
negdata.append(val[0])
def word_split(data):
data_new = []
for word in data:
word_filter = [i.lower() for i in word.split()]
data_new.append(word_filter)
return data_new
def word_split_sentiment(data):
data_new = []
for (word, sentiment) in data:
word_filter = [i.lower() for i in word.split()]
data_new.append((word_filter, sentiment))
return data_new
def word_feats(words):
return dict([(word, True) for word in words])
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))
def stopword_filtered_word_feats(words):
return dict([(word, True) for word in words if word not in stopset])
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])
def bigram_word_feats_stopwords(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
bigram_finder = BigramCollocationFinder.from_words(words)
bigrams = bigram_finder.nbest(score_fn, n)
"""
print words
for ngram in itertools.chain(words, bigrams):
if ngram not in stopset:
print ngram
exit()
"""
return dict([(ngram, True) for ngram in itertools.chain(words, bigrams) if ngram not in stopset])
# Calculating Precision, Recall & F-measure
def evaluate_classifier(featx):
negfeats = [(featx(f), 'neg') for f in word_split(negdata)]
posfeats = [(featx(f), 'pos') for f in word_split(posdata)]
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
classifier = 'svm'
for cl in classifier:
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testfeats):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
accuracy = nltk.classify.util.accuracy(classifier, testfeats)
pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
print ''
print '---------------------------------------'
print 'SINGLE FOLD RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', accuracy
print 'precision', (pos_precision + neg_precision) / 2
print 'recall', (pos_recall + neg_recall) / 2
print 'f-measure', (pos_fmeasure + neg_fmeasure) / 2
print ''
## CROSS VALIDATION
trainfeats = negfeats + posfeats
# SHUFFLE TRAIN SET
random.shuffle(trainfeats)
n = 5
for cl in classifier_list:
subset_size = len(trainfeats) / n
accuracy = []
pos_precision = []
pos_recall = []
neg_precision = []
neg_recall = []
pos_fmeasure = []
neg_fmeasure = []
cv_count = 1
for i in range(n):
testing_this_round = trainfeats[i*subset_size:][:subset_size]
training_this_round = trainfeats[:i*subset_size] + trainfeats[(i+1)*subset_size:]
if cl == 'svm':
classifierName = 'SVM'
classifier = SklearnClassifier(LinearSVC(), sparse=False)
classifier.train(training_this_round)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testing_this_round):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
cv_accuracy = nltk.classify.util.accuracy(classifier, testing_this_round)
cv_pos_precision = nltk.metrics.precision(refsets['pos'], testsets['pos'])
cv_pos_recall = nltk.metrics.recall(refsets['pos'], testsets['pos'])
cv_pos_fmeasure = nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
cv_neg_precision = nltk.metrics.precision(refsets['neg'], testsets['neg'])
cv_neg_recall = nltk.metrics.recall(refsets['neg'], testsets['neg'])
cv_neg_fmeasure = nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
accuracy.append(cv_accuracy)
pos_precision.append(cv_pos_precision)
pos_recall.append(cv_pos_recall)
neg_precision.append(cv_neg_precision)
neg_recall.append(cv_neg_recall)
pos_fmeasure.append(cv_pos_fmeasure)
neg_fmeasure.append(cv_neg_fmeasure)
cv_count += 1
print '---------------------------------------'
print 'N-FOLD CROSS VALIDATION RESULT ' + '(' + classifierName + ')'
print '---------------------------------------'
print 'accuracy:', sum(accuracy) / n
print 'precision', (sum(pos_precision)/n + sum(neg_precision)/n) / 2
print 'recall', (sum(pos_recall)/n + sum(neg_recall)/n) / 2
print 'f-measure', (sum(pos_fmeasure)/n + sum(neg_fmeasure)/n) / 2
print ''
evaluate_classifier(word_feats)`
它假设使用SVM来分析来自csv文档的情绪,但是当我运行代码时我得到了这个错误anyone have an idea to fix it??
真的需要你帮助的人