class NaiveBayes():
def __init__(self):
self.summaries = dict()
def fit(self, x_train, y_train):
self.total_sample = len(x_train)
classes = self.group_by_class(x_train, y_train)
for class_value, rows in classes.items():
rows = np.array(rows)
std_of_columns = np.std(rows, axis=0)
mean_of_columns = np.mean(rows, axis=0)
number_of_instance = len(rows)
self.summaries[class_value] = (std_of_columns, mean_of_columns, number_of_instance)
def predict_sample(self, sample_to_be_predicted):
predictions = np.zeros(len(sample_to_be_predicted))
for i in range(len(sample_to_be_predicted)):
predictions[i] = self.predict(sample_to_be_predicted[i])
def predict(self, instance_to_be_predicted):
probabilities = self.compute_probabilities(instance_to_be_predicted)
prediction = self.classify(probabilities)
return prediction
def compute_probabilities(self, instance_to_be_predicted):
probabilities = dict()
for label in self.summaries:
std_of_columns, mean_of_columns, number_of_instance = self.summaries[label]
probabilities[label] = math.log(number_of_instance / self.total_sample)
for feature, mean, std in zip(instance_to_be_predicted, mean_of_columns, std_of_columns):
probabilities[label] += math.log(self.calculate_probability(feature, mean, std))
probabilities[label] = exp(probabilities[label])
return probabilities
def classify(self, probabilities):
best_label = max(probabilities, key=probabilities.get)
return best_label
def group_by_class(self, x_train, y_train):
classes = dict()
for i, j in zip(y_train, x_train):
if i not in classes:
classes[i] = [j]
else:
classes[i].append(j)
return classes
def calculate_probability(self, feature, mean, stdev):
exponent = exp(-((feature - mean) ** 2 / (2 * stdev ** 2)))
return (1 / (sqrt(2 * pi) * stdev)) * exponent
我正在尝试使用在calculate_probability 函数中发生的高斯分布函数来实现朴素贝叶斯。但是,如果正在计算的类的 std 为零,我将获得 NaN 值。如果“-((feature - mean) ** 2 / (2 * stdev ** 2))”(calculate_probability 中表达式的一部分)太大,我得到的概率为 0,因为它导致 (e ^ -huge_number) 评估为 0。我到处钻研,但找不到解决方案。拉普拉斯平滑是不可行的,因为我们正在以一种非常不同的方式进行跟踪。那么,我如何才能在要预测的实例中修复为特征(维度)计算为 0 或 NaN 的概率?