statistics.StatisticsError: no unique mode; found 2 equally common values

Time: 2018-11-30 17:50:21

Tags: python-3.x

When I use a large amount of data, it shows this error: statistics.StatisticsError: no unique mode; found 2 equally common values ('no unique mode; found %d equally common values' % len(table)). With only 100 rows of data it works fine. I cannot figure out why it fails or how to fix this error; any help would be appreciated.

Data link: https://github.com/YoeriNijs/TweetAnalyzer

Code:

import warnings
warnings.filterwarnings("ignore")

import nltk, random, csv, sys

from nltk.probability import FreqDist, ELEProbDist
from nltk.classify.util import apply_features,accuracy

from nltk.corpus import names
from nltk.tokenize import word_tokenize
import nltk.classify.util
from nltk import NaiveBayesClassifier
from textblob import TextBlob

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB


from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers


    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)



def get_words_in_tweets(tweets):
    all_words = []
    try:
        for (words, sentiment) in tweets:
            all_words.extend(words)
        return all_words

    except Exception as e:
        print(e)



def get_word_features(wordlist):
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    #print (word_features)
    return word_features


def selectTweets(row):
    tweetWords = []
    words = row[0].split()
    for i in words:
        i = i.lower()
        i = i.strip('@#\'"?,.!')
        tweetWords.append(i)
    row[0] = tweetWords

    if counter <= 120:
        trainTweets.append(row)
        #print(trainTweets)
        #print(('*')*30)

    else:
        testTweets.append(row)
        #print(testTweets)


def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
      features['contains(%s)' % word] = (word in document_words)
    return features 


trainTweets = []
testTweets = []

#csvfile.csv
while True:

    # Ask for filename
    filename =  str(input("> Please enter a filename (.csv): "))

    #Check if filename ends with .csv
    if filename.endswith(".csv"):

        try:

            #Open file
            with open(filename, 'r',encoding='utf-8') as csvfile: 
                reader = csv.reader(csvfile, delimiter=';', quotechar='|')

                #Print success message
                print ("> File opened successfully!")


                counter = 0
                for row in reader:
                    selectTweets(row)
                    counter += 1

                print (counter,"> Wait a sec for the results...")

                word_features = get_word_features(get_words_in_tweets(trainTweets))      


                training_set = apply_features(extract_features, trainTweets)
                test_training_set=apply_features(extract_features, testTweets)


                classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
                classifier.show_most_informative_features(5)
                print (nltk.classify.util.accuracy(classifier,test_training_set))


                MNB_classifier = SklearnClassifier(MultinomialNB())
                MNB_classifier.train(training_set)
                print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, test_training_set))

                BNB_classifier = SklearnClassifier(BernoulliNB())
                BNB_classifier.train(training_set)
                print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, test_training_set))




                LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
                LogisticRegression_classifier.train(training_set)
                print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_training_set))*100)



                SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
                SGDClassifier_classifier.train(training_set)
                print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_training_set))*100)

                SVC_classifier = SklearnClassifier(SVC())
                SVC_classifier.train(training_set)
                print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_training_set))*100)

                LinearSVC_classifier = SklearnClassifier(LinearSVC())
                LinearSVC_classifier.train(training_set)
                print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_training_set))*100)

                voted_classifier = VoteClassifier(classifier,
                                                  LinearSVC_classifier,
                                                  SGDClassifier_classifier,
                                                  MNB_classifier,
                                                  BNB_classifier,
                                                  LogisticRegression_classifier)
                print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)



                while True:


                    tweet =  str(input("Please enter the text of the tweet you want to analyze: "))
                    print (classifier.classify(extract_features(tweet.split())))



                    while True:
                        print
                        repeat =  str(input("> Do you want to check another tweet (y/n)? "))

                        if repeat == "n":
                            print ("Exit program")
                            sys.exit()
                        if repeat != "y":
                            print ("Something went wrong")
                        if repeat == "y":
                            break         

    #If file does not exist, display this
        except IOError:
            print ("File does not exist.")

#Else if file does not end with .csv, do this
    else:
        print ("Please open a file that ends with .csv")

It shows this error:

Traceback (most recent call last):
  File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 163, in         <module>
    print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
  File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\util.py", line 87, in accuracy
    results = classifier.classify_many([fs for (fs, l) in gold])
  File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in classify_many
    return [self.classify(fs) for fs in featuresets]
  File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in <listcomp>
    return [self.classify(fs) for fs in featuresets]
  File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 35, in classify
    return mode(votes)
  File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\statistics.py", line 507, in mode

'没有唯一模式;发现%d个共同值'%len(表)    statistics.StatisticsError:没有唯一模式;发现2个相同的常用值

1 Answer:

Answer 0 (score: 0)

The easiest way to fix this is to upgrade Python to 3.8 or newer.

In Python 3.7 and earlier, statistics.mode requires a single value to occur more often than any other in the data. If the data contains two or more values that are tied for most common, the mode is ambiguous and the exact error you are seeing is raised.
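
As a quick illustration (a small reproduction sketch, not part of the original answer), the tie can be triggered directly on Python 3.6 or 3.7:

import statistics

votes = ["pos", "neg", "pos", "neg"]  # two labels, equally common

try:
    print(statistics.mode(votes))
except statistics.StatisticsError as err:
    print(err)  # no unique mode; found 2 equally common values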

Starting with Python 3.8, however, this behavior changed: if the data contains two or more equally common values, statistics.mode returns the first one encountered in the data instead of raising an error. (Python 3.8 also added statistics.multimode, which returns all of the tied values as a list.)

Example:

result = statistics.mode([1, 1, 2, 2, 3, 3])

There are three equally common candidates, 1, 2 and 3, because each number appears twice in the data.

In Python 3.7 this raises statistics.StatisticsError, while in Python 3.8 it returns 1 as the mode.
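
If upgrading Python is not an option, another way around the error is to stop using statistics.mode for the vote count and pick the winning label with collections.Counter, which does not raise on a tie. This is a minimal sketch of that idea, not part of the original answer; the confidence helper is an extra assumption:

from collections import Counter
from nltk.classify import ClassifierI

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        # One vote per wrapped classifier.
        votes = [c.classify(features) for c in self._classifiers]
        # most_common(1) simply returns one of the top labels on a tie
        # instead of raising StatisticsError like statistics.mode does.
        return Counter(votes).most_common(1)[0][0]

    def confidence(self, features):
        # Hypothetical extra: fraction of classifiers that agree with the winner.
        votes = [c.classify(features) for c in self._classifiers]
        winner, count = Counter(votes).most_common(1)[0]
        return count / len(votes)

With this version the rest of the script, including nltk.classify.accuracy(voted_classifier, test_training_set), can stay exactly as it is on Python 3.6 or 3.7.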