Hi, I have written a Python program that uses bigram features with the Naive Bayes algorithm. My dataset is a collection of text files. The code runs fine on some of the text files, but it fails on others. How can I improve my code so that it succeeds on every text file? I have posted my code below.
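I suspect the files that fail contain bytes that are not valid UTF-8, so open(..., encoding='UTF8') raises UnicodeDecodeError partway through the loop. The reading pattern in the code below therefore passes errors='replace', which substitutes undecodable bytes instead of crashing. A minimal sketch of that pattern (the file name is hypothetical, for illustration only):

# Read one review tolerantly: bad bytes become U+FFFD instead of raising.
with open('review_0.txt', 'r', encoding='utf-8', errors='replace') as f:
    words = f.read().split()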
from __future__ import division
import os
import itertools
import collections
import nltk.classify.util
from nltk.corpus import stopwords  # requires the stopwords corpus: nltk.download('stopwords')
from nltk.classify import NaiveBayesClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics.scores import precision, recall, f_measure
# Read every file in a directory into a list of reviews, one word list per file.
# Appending inside the loop keeps all reviews (assigning would keep only the last
# file), and errors='replace' substitutes any bytes that are not valid UTF-8, so
# a single badly encoded file cannot abort the whole run.
def load_reviews(dirpath):
    reviews = []
    for filename in os.listdir(dirpath):
        with open(os.path.join(dirpath, filename), 'r', encoding='utf-8', errors='replace') as f:
            reviews.append(f.read().split())
    return reviews

neg_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/train/neg")
print('train negative:', len(neg_reviews))
pos_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/train/pos")
print('train positive:', len(pos_reviews))
test_pos_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/test/pos")
print('test positive:', len(test_pos_reviews))
test_neg_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/test/neg")
print('test negative:', len(test_neg_reviews))
# Currently unused helper: lowercases and re-splits each entry of a list of strings.
def word_split(data):
    data_new = []
    for word in data:
        word_filter = [i.lower() for i in word.split()]
        data_new.append(word_filter)
    return data_new
# English stopwords, minus a few sentiment-bearing words that are kept as features.
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))
# Build a feature dict from one review (a list of words): every unigram that is
# not a stopword, plus the n highest-scoring bigrams by chi-squared association.
# Bigrams are tuples, so the stopset membership test only ever filters unigrams.
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict((ngram, True) for ngram in itertools.chain(words, bigrams) if ngram not in stopset)
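# For example, bigram_word_feats('i loved this movie'.split()) yields the feature
# dict {'loved': True, 'movie': True, ('i', 'loved'): True, ('loved', 'this'): True,
# ('this', 'movie'): True} -- 'i' and 'this' are dropped as stopwords, while all
# bigram tuples pass the stopset filter.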
def evaluate_classifier(featx):
    # Turn each review into a labelled feature dict, train on the training
    # split, then score the held-out test split.
    negtrnfeats = [(featx(f), 'negative') for f in neg_reviews]
    postrnfeats = [(featx(f), 'positive') for f in pos_reviews]
    negtestfeats = [(featx(f), 'negative') for f in test_neg_reviews]
    postestfeats = [(featx(f), 'positive') for f in test_pos_reviews]
    trainfeats = postrnfeats + negtrnfeats
    testfeats = negtestfeats + postestfeats
    classifier = NaiveBayesClassifier.train(trainfeats)
    # Collect reference labels and predictions keyed by test-item index so
    # NLTK's set-based precision/recall/f-measure functions can be used.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testfeats):
        refsets[label].add(i)
        predicted = classifier.classify(features)
        testsets[predicted].add(i)
    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = precision(refsets['positive'], testsets['positive'])
    pos_recall = recall(refsets['positive'], testsets['positive'])
    pos_fmeasure = f_measure(refsets['positive'], testsets['positive'])
    neg_precision = precision(refsets['negative'], testsets['negative'])
    neg_recall = recall(refsets['negative'], testsets['negative'])
    neg_fmeasure = f_measure(refsets['negative'], testsets['negative'])
    print('accuracy:', accuracy)
    # Macro-average the per-class scores.
    print('precision:', (pos_precision + neg_precision) / 2)
    print('recall:', (pos_recall + neg_recall) / 2)
    print('f-measure:', (pos_fmeasure + neg_fmeasure) / 2)
evaluate_classifier(bigram_word_feats)
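As a sanity check of the feature format NaiveBayesClassifier expects, here is a toy example unrelated to the review data (the names and feature values are made up); the keys mix unigram strings and bigram tuples, which is exactly what bigram_word_feats produces:

# Two hand-written training items: (feature dict, label).
toy_train = [({'loved': True, ('loved', 'it'): True}, 'positive'),
             ({'hated': True, ('hated', 'it'): True}, 'negative')]
toy_classifier = NaiveBayesClassifier.train(toy_train)
print(toy_classifier.classify({'loved': True}))  # expected: 'positive'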