属性错误消息

时间:2017-02-14 14:06:17

标签: python

import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
import sys
sys.getdefaultencoding()
import os

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier

from nltk.classify import ClassifierI
from statistics import mode

from nltk.tokenize import word_tokenize

class VoteClassifier(ClassifierI):
    def __int__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = [ ]
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = [ ]
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf


short_pos = os.open("positive.txt", os.O_RDONLY).read()
short_neg = os.open("negative.txt", os.O_RDONLY).read()

documents = [ ]

for r in short_pos.split('\n'):
    documents.append( (r, "pos") )

for r in short_neg.split('\n'):
    documents.append( (r, "neg") )

    all_words = [ ]

    short_pos_words = word_tokenize(short_pos)
    short_neg_words = word_tokenize(short_neg)

    for w in short_pos_words:
        all_words.append(w.lower())

    for w in short_neg_words:
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:5000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words) 

    return features

# print ((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev), category) for (rev, category) in documents] random.shuffle(featuresets)

#training with increased data collection 
training_set =  featuresets[:10000]
#testing with increased data Collection
testing_set = featuresets[10000:]

#define and train classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
#testing classifier
#print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
#show the 15 most valuable words when it comes to positive or negative reviews
#classifier.show_most_informative_features(15)
#saving classifier
#save_classifier = open("naivebayes.pickle", "wb")
#pickle.dump(classifier, save_classifier)
#save_classifier.close()

#loading classifier
#classifier_f = open("naivebayes.pickle", "rb")
#classifier = pickle.load(classifier_f)
#classifier_f.close()

print("Original Naive Bayes Alogrithm acurracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
    classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, testing_set))

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:", nltk.classify.accuracy(BernoulliNB_classifier, testing_set))


SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDclassifier accuracy percent:", nltk.classify.accuracy(SGDClassifier_classifier, testing_set))

#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(training_set)
#print("SVC accuracy percent:", nltk.classify.accuracy(SVC_classifier, testing_set))

voted_classifier = VoteClassifier(classifier, 
                            SGDClassifier_classifier,
                            MNB_classifier, 
                            BernoulliNB_classifier )

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, training_set))*100)

#print ("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence%:", voted_classifier.confidence(testing_set[0][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence%:", voted_classifier.confidence(testing_set[1][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence%:", voted_classifier.confidence(testing_set[2][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence%:", voted_classifier.confidence(testing_set[3][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence%:", voted_classifier.confidence(testing_set[4][0])*100)
#print ("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence%:", voted_classifier.confidence(testing_set[5][0])*100)

运行上面的代码时,我收到错误

short_pos = os.open("positive.txt", os.O_RDONLY).read()
AttributeError: 'int' object has no attribute 'read'

为什么会出现此错误,如何防止此错误再次出现?

1 个答案:

答案 0 :(得分:2)

这是因为你试图在.read()的返回值上调用os.open()方法,它返回一个int,而不是一个类似文件的对象。

我认为你打算做的是使用一个简单的

with open('filename.txt', 'r') as f:
    text = f.read()

或者,如果你真的想要一个单行:

text = open('filename.txt', 'r').read()

这两行:

short_pos = os.open("positive.txt", os.O_RDONLY).read()
short_neg = os.open("negative.txt", os.O_RDONLY).read()

应改为:

with open("positive.txt", 'r') as f:
    short_pos = f.read()

with open("negative.txt", 'r') as f:
    short_neg = f.read()

此外,不是读取整个文件的内容,而是按照这样的\n分割它们:

for r in short_pos.split('\n'):    # This .split()
    documents.append( (r, "pos") )

for r in short_neg.split('\n'):    # And this .split()
    documents.append( (r, "neg") )

而不是使用str.split(),最好先使用.readlines()而不是read()来阅读文件。前者将返回文件流中的行列表,您不必担心不同操作系统使用的不同行结束方案。