def train(cls, labeled_featuresets, estimator=ELEProbDist):
    """
    Build a classifier from labeled training data by counting label and
    feature-value frequencies and converting them into probability
    distributions with ``estimator``.

    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.  Each
        featureset must provide ``items()`` yielding
        ``(fname, fval)`` pairs.
    :param estimator: Callable used to turn a frequency distribution
        into a probability distribution.  It is called as
        ``estimator(freqdist)`` for the labels and as
        ``estimator(freqdist, bins=n)`` for each ``(label, fname)`` pair.
    :return: ``cls(label_probdist, feature_probdist)``, where
        ``feature_probdist`` is a dict keyed by ``(label, fname)``.
    """
    label_freqdist = FreqDist()
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    fnames = set()

    # Count up how many times each feature value occurred, given
    # the label and featurename.
    for featureset, label in labeled_featuresets:
        label_freqdist[label] += 1
        for fname, fval in featureset.items():
            # Increment freq(fval|label, fname)
            feature_freqdist[label, fname][fval] += 1
            # Record that fname can take the value fval.
            feature_values[fname].add(fval)
            # Keep a list of all feature names.
            fnames.add(fname)

    # If a feature didn't have a value given for an instance, then
    # we assume that it gets the implicit value 'None.'  This loop
    # counts up the number of 'missing' feature values for each
    # (label,fname) pair, and increments the count of the fval
    # 'None' by that amount.
    for label in label_freqdist:
        num_samples = label_freqdist[label]
        for fname in fnames:
            count = feature_freqdist[label, fname].N()
            # Only add a None key when necessary, i.e. if there are
            # any samples with feature 'fname' missing.
            missing = num_samples - count
            if missing > 0:
                feature_freqdist[label, fname][None] += missing
                # None is now a possible value of fname, so it must be
                # included when the number of bins is computed below.
                feature_values[fname].add(None)

    # Create the P(label) distribution
    label_probdist = estimator(label_freqdist)

    # Create the P(fval|label, fname) distribution; the number of bins
    # is the number of distinct values recorded for that feature name.
    feature_probdist = {}
    for (label, fname), freqdist in feature_freqdist.items():
        probdist = estimator(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist

    return cls(label_probdist, feature_probdist)
答案 0 :(得分:1)
朴素贝叶斯(Naive Bayes)算法不进行任何特征选择。我不清楚你在写“由于维数的诅咒,它无法找到任何模式”时所设想的“模式”是什么,但它确实会使用提供给模型的所有特征。
标签概率是通过组合每个单独特征的概率估计来估计的,即假设这些特征在统计上相互独立(这正是模型中“朴素”的部分)。这使得构建朴素贝叶斯概率模型变得非常快速和容易。预测能力更强的特征(即与某个结果标签关联更强的特征)对计算出的概率影响更大。任何在各个标签下概率大致相同的特征,对估计结果的影响可以忽略不计。