我是python的新手,正在尝试解析朴素贝叶斯算法(多项式模型),但是我的代码出现了一些错误。
from collections import Counter
import nltk
from nltk.corpus import stopwords
def CountDocs(D):
D_length = X_train_counts.shape[0] # number of documents
return D_length
def ExtractVocabulary(D):
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(D.data)
return vectorizer.get_feature_names()
def CountDocsInClass(c):
data_from_class = fetch_20newsgroups(subset='train', categories=[c]).data
return len(data_from_class)
def ConcatenateTextOfAllDocsInClass(data_from_class):
for i in range(0, len(data_from_class)):
if i not in stop_words:
text += word_tokenize[i]
return text
def TrainMultinomialNB(C,D):
V = D.data
N = CountDocs(D)
ClassLength = {}
prior = {}
condprob = {}
for c in C:
class_length = CountDocsInClass(c)
print(data_from_class)
ClassLength.update({c: nc})
prior.update({c:class_length/N})
textC = ConcatenateTextOfAllDocsInClass(c)
T_ct = Counter(textC)
T_ct_sum = sum(T_ct[t] + 1 for t in V)
for t in V:
if t not in condprob:
condprob.update({t: {c: (T_ct[t] + 1)/T_ct_sum}})
else:
condprob[t].update({c: (T_ct[t] + 1)/T_ct_sum})
return V, prior, condprob
TrainMultinomialNB(feature_names, twenty_train)
我在其中应用此算法的数据:
变量feature_names
代表课程,变量twenty_train
代表训练集。
x_train_counts
是上面这段代码的使用,它是:X_train_counts = count_vect.fit_transform(twenty_train.data)
data_from_class变量加载20个Newsgrous数据集-一组12k个文档的集合,分为c个不同类别