#Load libraries
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy corpus: five short sentences, one document per entry.
text_data = np.array(
    [
        'Tim is smart!',
        'Joy is the best',
        'Lisa is dumb',
        'Fred is lazy',
        'Lisa is lazy',
    ]
)

# Binary class labels, aligned one-to-one with the sentences in text_data.
y = np.array([1, 1, 0, 0, 0])

# Learn the vocabulary and IDF weights from the corpus and build the
# document-term TF-IDF matrix used as model input.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(text_data)

# Multinomial naive Bayes with smoothing alpha=1 and class priors estimated
# from the label frequencies; fit() returns the estimator itself.
mnb = MultinomialNB(alpha=1, fit_prior=True, class_prior=None).fit(X, y)
# Vocabulary learned by the vectorizer, in feature-column order.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement, and .tolist() keeps the printed form a plain list of str.
print(tfidf_vectorizer.get_feature_names_out().tolist())
# output
# ['best', 'dumb', 'fred', 'is', 'joy', 'lazy', 'lisa', 'smart', 'the', 'tim']

# Log prior of each class, in the order of mnb.classes_.
print(mnb.class_log_prior_)
# output
# [-0.51082562 -0.91629073]

# Smoothed log probability of each feature given the class:
# one row per class, one column per vocabulary term.
print(mnb.feature_log_prob_)
# output
# [[-2.70956132 -2.16160272 -2.16160272 -1.97680838 -2.70956132 -1.90253959
#   -1.90253959 -2.70956132 -2.70956132 -2.70956132]
#  [-2.1671355  -2.60968347 -2.60968347 -2.14937805 -2.1671355  -2.60968347
#   -2.60968347 -2.09680488 -2.1671355  -2.09680488]]
# Vectorize the one-word query once and reuse it for both calls below.
X_new = tfidf_vectorizer.transform(["best"])
X_log_proba = mnb.predict_log_proba(X_new)

# Predicted class label for the same sample.
print(mnb.predict(X_new))
# output
# [1]

print(X_log_proba)
# output
# [[-0.76397049 -0.62700977]]
#
# Why this is not simply "log prior + feature log prob": predict_log_proba
# returns the *normalized* log posterior. With the query's only TF-IDF weight
# equal to 1.0 (L2-normalized single term), the joint log-likelihood is
#   class_log_prior_ + feature_log_prob_[:, 'best']
#     = [-0.51082562 - 2.70956132, -0.91629073 - 2.1671355]
#     = [-3.22038694, -3.08342623]
# and subtracting logsumexp of that vector (approx. -2.45641646) gives
#   [-0.76397049, -0.62700977].
我正在尝试手动计算 X_log_proba 的值。据我所知,这个值等于类的对数先验加上该类中各特征的对数概率。但是,例如在预测仅包含单词“best”(它在类别 1 中的对数概率为 -2.167)的文本时,模型给出的结果是 [[-0.76397049 -0.62700977]]。
但是,当我把特征“best”的对数概率(-2.167)与该类的对数先验(-0.91629073)相加时,得到的结果并不是 -0.627。有人能解释一下,在这个给定文本仅包含单词“best”的示例中,模型是如何得出 -0.627 这个预测对数概率的吗?