from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
# Train a multinomial naive Bayes spam classifier on bag-of-words counts and
# score one test email as a whole.


def _read_text(path):
    """Return the full contents of the text file at *path*, closing it afterwards."""
    with open(path) as handle:
        return handle.read()


# CountVectorizer treats every element of the list it receives as ONE document.
# The training files are split on whitespace, so every token is its own
# training sample.
# NOTE(review): if spam.txt / nonspam.txt hold one email per line, switch to
# .splitlines() so that each email is a training sample -- confirm file format.
spam_emails = _read_text("spam.txt").split()
non_spam = _read_text("nonspam.txt").split()

# The test email must stay a single document. Splitting it into words made the
# vectorizer emit one row per word, so predict()/predict_proba() returned one
# result per word instead of one result for the whole email.
test_email = [_read_text("test.txt")]

# No manual lower-casing is needed: CountVectorizer lowercases its input by
# default (lowercase=True).
counter = CountVectorizer()
training_counts = counter.fit_transform(spam_emails + non_spam)
test_counts = counter.transform(test_email)

# Label 0 marks samples from spam.txt, label 1 samples from nonspam.txt. The
# counts are derived from the data so they always match the number of rows in
# training_counts.
training_labels = [0] * len(spam_emails) + [1] * len(non_spam)

classifier = MultinomialNB()
classifier.fit(training_counts, training_labels)

# One predicted label and one row of class probabilities for the whole email;
# the probability columns follow the order of classifier.classes_.
print(classifier.predict(test_counts))
print(classifier.predict_proba(test_counts))
我正在尝试使用 sklearn 实现朴素贝叶斯分类器。我将数据保存为 .txt 文件,但每次运行时,对 test_counts 调用 predict_proba 都会为每个单词(而不是整封电子邮件)生成一个概率。我该怎么做才能得到整封电子邮件的预测概率,而不是每个单词的概率?