How to run the naive Bayes algorithm on all text files in a dataset instead of only a few

Time: 2016-12-30 20:04:47

Tags: python-3.x sentiment-analysis naivebayes

Hello, I have written a Python program for the naïve Bayes algorithm using bigram features. My dataset consists of a collection of text files. The code runs correctly on some of the text files, but not on all of them. How can I improve my code so that it works on every text file? I have posted my code below.

from __future__ import division
import os
import nltk.classify.util
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
import random
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics.scores import precision, recall, f_measure
import itertools, collections
negfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/train/neg")
print('train negative:', negfilenames)
neg_reviews = []
for filename in negfilenames:
    # Append one token list per file; reassigning neg_reviews inside the loop
    # would keep only the last file's contents.
    with open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/train/neg", filename), 'r', encoding='UTF8') as f:
        neg_reviews.append(f.read().split())
print('train negative:', neg_reviews)
print('train negative:', len(neg_reviews))
posfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/train/pos")
print('train positive:', posfilenames)
pos_reviews = []
for filename in posfilenames:
    with open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/train/pos", filename), 'r', encoding='UTF8') as f:
        pos_reviews.append(f.read().split())
print('train positive:', pos_reviews)
print('train positive:', len(pos_reviews))
testposfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/test/pos")
print('test positive:', testposfilenames)
test_pos_reviews = []
for filename in testposfilenames:
    with open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/test/pos", filename), 'r', encoding='UTF8') as f:
        test_pos_reviews.append(f.read().split())
print('test positive:', test_pos_reviews)
print('test positive:', len(test_pos_reviews))
testnegfilenames = os.listdir("C:/Users/Sharmili/Desktop/movie_reviews/test/neg")
print('test negative:', testnegfilenames)
test_neg_reviews = []
for filename in testnegfilenames:
    # The read must stay inside the loop so that every file is processed.
    with open(os.path.join("C:/Users/Sharmili/Desktop/movie_reviews/test/neg", filename), 'r', encoding='UTF8') as f:
        test_neg_reviews.append(f.read().split())
print('test negative:', test_neg_reviews)
print('test negative:', len(test_neg_reviews))
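# Optional sanity check (added for illustration): with the loops above
# accumulating correctly, each list holds exactly one review per file.
assert len(neg_reviews) == len(negfilenames)
assert len(pos_reviews) == len(posfilenames)
assert len(test_pos_reviews) == len(testposfilenames)
assert len(test_neg_reviews) == len(testnegfilenames)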

def word_split(data):
    # Lower-case and re-tokenize each entry (defined here but not used below).
    data_new = []
    for word in data:
        word_filter = [i.lower() for i in word.split()]
        data_new.append(word_filter)
    return data_new
# English stopwords, minus negation and intensity words that carry sentiment.
stopset = set(stopwords.words('english')) - set(('over', 'under', 'below', 'more', 'most', 'no', 'not', 'only', 'such', 'few', 'so', 'too', 'very', 'just', 'any', 'once'))

def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    # Rank the review's bigrams by chi-squared and keep the best n, then turn
    # the unigrams and those bigrams into boolean features, dropping stopwords.
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams) if ngram not in stopset])
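# Illustrative example of the feature shape: for the tokens
# ['not', 'a', 'good', 'movie'], the returned dict contains entries such as
# {'not': True, 'good': True, 'movie': True, ('not', 'a'): True, ...}.
# 'a' is dropped as a unigram because it is in stopset, but bigram tuples are
# never in stopset, so pairs containing a stopword survive as features.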
def evaluate_classifier(bigram_word_feats):
    # Build labelled feature dicts for every review in each split.
    negtrnfeats = [(bigram_word_feats(f), 'negative') for f in neg_reviews]
    postrnfeats = [(bigram_word_feats(f), 'positive') for f in pos_reviews]
    negtestfeats = [(bigram_word_feats(f), 'negative') for f in test_neg_reviews]
    postestfeats = [(bigram_word_feats(f), 'positive') for f in test_pos_reviews]
    trainfeats = postrnfeats + negtrnfeats
    testfeats = negtestfeats + postestfeats
    classifier = NaiveBayesClassifier.train(trainfeats)
    # refsets holds the gold labels and testsets the predicted labels, both as
    # sets of test-item indices, so the metric helpers can compare them.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (features, label) in enumerate(testfeats):
        refsets[label].add(i)
        predicted = classifier.classify(features)
        testsets[predicted].add(i)
    accuracy = nltk.classify.util.accuracy(classifier, testfeats)
    pos_precision = precision(refsets['positive'], testsets['positive'])
    pos_recall = recall(refsets['positive'], testsets['positive'])
    pos_fmeasure = f_measure(refsets['positive'], testsets['positive'])
    neg_precision = precision(refsets['negative'], testsets['negative'])
    neg_recall = recall(refsets['negative'], testsets['negative'])
    neg_fmeasure = f_measure(refsets['negative'], testsets['negative'])
    print('accuracy:', accuracy)
    print('precision:', (pos_precision + neg_precision) / 2)
    print('recall:', (pos_recall + neg_recall) / 2)
    print('f-measure:', (pos_fmeasure + neg_fmeasure) / 2)
evaluate_classifier(bigram_word_feats)
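The failure mode described in this question usually comes from reassigning the list inside each loading loop (neg_reviews = f.read().split() keeps only the last file read); the loops above therefore append one token list per file instead. The four near-identical blocks can also be collapsed into a single helper. This is a minimal sketch assuming the same movie_reviews directory layout; load_reviews is a name introduced here for illustration:

import os

def load_reviews(folder):
    # Return one token list per file, so downstream code sees every review.
    reviews = []
    for filename in os.listdir(folder):
        with open(os.path.join(folder, filename), 'r', encoding='UTF8') as f:
            reviews.append(f.read().split())
    return reviews

neg_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/train/neg")
pos_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/train/pos")
test_pos_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/test/pos")
test_neg_reviews = load_reviews("C:/Users/Sharmili/Desktop/movie_reviews/test/neg")

With the lists populated this way, evaluate_classifier(bigram_word_feats) trains and tests on every review in all four folders rather than on only the last file read from each.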

0 Answers:

No answers