当我使用大量数据时显示此错误:('无唯一模式;找到%d个相同的公共值'%len(table)statistics.StatisticsError:没有唯一模式;找到2个相同的公共值)。但是使用100个数据就可以了。我无法理解任何帮助它都不起作用的原因以及如何解决此错误。
数据链接:https://github.com/YoeriNijs/TweetAnalyzer
代码:
import warnings
warnings.filterwarnings("ignore")
import nltk, random, csv, sys
from nltk.probability import FreqDist, ELEProbDist
from nltk.classify.util import apply_features,accuracy
from nltk.corpus import names
from nltk.tokenize import word_tokenize
import nltk.classify.util
from nltk import NaiveBayesClassifier
from textblob import TextBlob
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
def __init__(self, *classifiers):
self._classifiers = classifiers
def classify(self, features):
votes = []
for c in self._classifiers:
v = c.classify(features)
votes.append(v)
return mode(votes)
def get_words_in_tweets(tweets):
all_words = []
try:
for (words, sentiment) in tweets:
all_words.extend(words)
return all_words
except Exception as e:
print(e)
def get_word_features(wordlist):
wordlist = FreqDist(wordlist)
word_features = wordlist.keys()
#print (word_features)
return word_features
def selectTweets(row):
tweetWords = []
words = row[0].split()
for i in words:
i = i.lower()
i = i.strip('@#\'"?,.!')
tweetWords.append(i)
row[0] = tweetWords
if counter <= 120:
trainTweets.append(row)
#print(trainTweets)
#print(('*')*30)
else:
testTweets.append(row)
#print(testTweets)
def extract_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
trainTweets = []
testTweets = []
#csvfile.csv
while True:
# Ask for filename
filename = str(input("> Please enter a filename (.csv): "))
#Check if filename ends with .csv
if filename.endswith(".csv"):
try:
#Open file
with open(filename, 'r',encoding='utf-8') as csvfile:
reader = csv.reader(csvfile, delimiter=';', quotechar='|')
#Print succes message
print ("> File opened successfully!")
counter = 0
for row in reader:
selectTweets(row)
counter += 1
print (counter,"> Wait a sec for the results...")
word_features = get_word_features(get_words_in_tweets(trainTweets))
training_set = apply_features(extract_features, trainTweets)
test_training_set=apply_features(extract_features, testTweets)
classifier = nltk.classify.NaiveBayesClassifier.train(training_set)
classifier.show_most_informative_features(5)
print (nltk.classify.util.accuracy(classifier,test_training_set))
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, test_training_set))
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, test_training_set))
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_training_set))*100)
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_training_set))*100)
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_training_set))*100)
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_training_set))*100)
voted_classifier = VoteClassifier(classifier,
LinearSVC_classifier,
SGDClassifier_classifier,
MNB_classifier,
BNB_classifier,
LogisticRegression_classifier)
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
while True:
tweet = str(input("Please enter the text of the tweet you want to analize: "))
print (classifier.classify(extract_features(tweet.split())))
while True:
print
repeat = str(input("> Do you want to check another tweet (y/n)? "))
if repeat == "n":
print ("Exit program")
sys.exit()
if repeat != "y":
print ("Something went wrong")
if repeat == "y":
break
#If file does not exist, display this"""
except IOError:
print ("File does not exist.")
#Else if file does not end with .csv, do this
else:
print ("Please open a file that ends with .csv")
显示此错误:
Traceback (most recent call last):
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 163, in <module>
print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier,test_training_set ))*100)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\util.py", line 87, in accuracy
results = classifier.classify_many([fs for (fs, l) in gold])
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in classify_many
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\site-packages\nltk\classify\api.py", line 77, in <listcomp>
return [self.classify(fs) for fs in featuresets]
File "C:\Users\Nahid\Desktop\main folder\newcheck.py", line 35, in classify
return mode(votes)
File "C:\Users\Nahid\AppData\Local\Programs\Python\Python36-32\lib\statistics.py", line 507, in mode
'没有唯一模式;发现%d个共同值'%len(表) statistics.StatisticsError:没有唯一模式;发现2个相同的常用值
答案 0 :(得分:0)
解决此问题的最简单方法是将 Python 升级到 3.8 或更高版本。
在 Python 3.7 及更早版本中,可能只有一个数字在整个集合中出现次数最多。如果一个集合包含两个或更多这样的数字,那么 mode 就会变得不确定并返回您得到的确切错误。
然而,从 3.8 版开始,整个数学概念发生了变化。如果集合中有两个或多个模式,则选择最小的模式作为结果。
示例:
result = statistics.mode([1,1,2,2,3,3])
有三种可能且相等的解:1、2 或 3 因为每个数字在集合中出现两次
在 Python 3.7 中这会返回一个错误,
在 Python 3.8 中返回 1 作为模式