import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
import sys
sys.getdefaultencoding()
import os
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
class VoteClassifier(ClassifierI):
def __int__(self, *classifiers):
self._classifiers = classifiers
def classify(self, features):
votes = [ ]
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)
def confidence(self, features):
votes = [ ]
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
choice_votes = votes.count(mode(votes))
conf = choice_votes / len(votes)
return conf
short_pos = os.open("positive.txt", os.O_RDONLY).read()
short_neg = os.open("negative.txt", os.O_RDONLY).read()
documents = [ ]
for r in short_pos.split('\n'):
documents.append( (r, "pos") )
for r in short_neg.split('\n'):
documents.append( (r, "neg") )
all_words = [ ]
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)
for w in short_pos_words:
all_words.append(w.lower())
for w in short_neg_words:
all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
word_features = list(all_words.keys())[:5000]
def find_features(document):
words = set(document)
features = {}
for w in word_features:
features[w] = (w in words)
return features
# print ((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents] random.shuffle(featuresets)
#training with increased data collection
training_set = featuresets[:10000]
#testing with increased data Collection
testing_set = featuresets[10000:]
#define and train classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
#testing classifier
#print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
#show the 15 most valuable words when it comes to positive or negative reviews
#classifier.show_most_informative_features(15)
#saving classifier
#save_classifier = open("naivebayes.pickle", "wb")
#pickle.dump(classifier, save_classifier)
#save_classifier.close()
#loading classifier
#classifier_f = open("naivebayes.pickle", "rb")
#classifier = pickle.load(classifier_f)
#classifier_f.close()
print("Original Naive Bayes Alogrithm acurracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, testing_set))
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:", nltk.classify.accuracy(BernoulliNB_classifier, testing_set))
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDclassifier accuracy percent:", nltk.classify.accuracy(SGDClassifier_classifier, testing_set))
#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(training_set)
#print("SVC accuracy percent:", nltk.classify.accuracy(SVC_classifier, testing_set))
voted_classifier = VoteClassifier(classifier,
SGDClassifier_classifier,
MNB_classifier,
BernoulliNB_classifier )
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, training_set))*100)
#print ("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence%:", voted_classifier.confidence(testing_set[0][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence%:", voted_classifier.confidence(testing_set[1][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence%:", voted_classifier.confidence(testing_set[2][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence%:", voted_classifier.confidence(testing_set[3][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence%:", voted_classifier.confidence(testing_set[4][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence%:", voted_classifier.confidence(testing_set[5][0])*100)
运行上面的代码时,我收到错误
short_pos = os.open("positive.txt", os.O_RDONLY).read()
AttributeError: 'int' object has no attribute 'read'
为什么会出现此错误,如何防止此错误再次出现?
答案 0 :(得分:2)
这是因为你试图在.read()
的返回值上调用os.open()
方法,它返回一个int,而不是一个类似文件的对象。
我认为你打算做的是使用一个简单的
with open('filename.txt', 'r') as f:
text = f.read()
或者,如果你真的想要一个单行:
text = open('filename.txt', 'r').read()
这两行:
short_pos = os.open("positive.txt", os.O_RDONLY).read()
short_neg = os.open("negative.txt", os.O_RDONLY).read()
应改为:
with open("positive.txt", 'r') as f:
short_pos = f.read()
with open("negative.txt", 'r') as f:
short_neg = f.read()
此外,不是读取整个文件的内容,而是按照这样的\n
分割它们:
for r in short_pos.split('\n'): # This .split()
documents.append( (r, "pos") )
for r in short_neg.split('\n'): # And this .split()
documents.append( (r, "neg") )
而不是使用str.split()
,最好先使用.readlines()
而不是read()
来阅读文件。前者将返回文件流中的行列表,您不必担心不同操作系统使用的不同行结束方案。