我目前正在开发一个垃圾内容过滤器,使用贝叶斯垃圾邮件过滤方法(http://en.wikipedia.org/wiki/Bayesian_spam_filtering )来检测用于网络钓鱼的表单。
它的工作原理是:扫描表单上的文本,计算每个单词属于垃圾内容的概率,然后将这些概率结合起来,得出该表单用于网络钓鱼的总体百分比。
我卡住的地方是如何将各个单词的概率结合起来,得出总体百分比。此外,我已经在更大的表单上进行了测试,最终得到的概率是以科学记数法表示的浮点数,小数点后有十多个零。我怎样才能把这些数值转换成百分比?也许是我对贝叶斯方法的实现不正确?
以下是大型表单的代码示例:
// Probability of each word being on a phishing form, keyed by the word itself.
$words = array(
    'euro' => 0.36,
    'million' => 0.57,
    'international' => 0.33,
    'award' => 0.23,
    'europe' => 0.48,
    'america' => 0.36,
    'private' => 0.25,
    'egames' => 0.90,
    'organizers' => 0.08,
    'cosponsors' => 0.69,
    'officially' => 0.25,
    'bring' => 0.16,
    'your' => 0.38,
    'notice' => 0.21,
    'final' => 0.15,
    'draw' => 0.40,
    'result' => 0.16,
    'egame' => 0.71,
    'which' => 0.15,
    'conducted' => 0.20,
    'complex' => 0.06,
    'corporate' => 0.14,
    'office' => 0.14,
    'spain' => 0.47,
    'wish' => 0.15,
    'congratulate' => 0.77,
    'inform' => 0.24,
    'selection' => 0.10,
    'coupon' => 0.20,
    'number' => 0.29,
    'selected' => 0.21,
    'among' => 0.27,
    'lucky' => 0.70,
    'consolation' => 0.36,
    'prize' => 0.53,
    'winners' => 0.24,
    'identified' => 0.07,
    'noemeggb' => 0.68,
    'random' => 0.32,
    'system' => 0.41,
    'erss' => 0.68,
    'entries' => 0.16,
    'different' => 0.20,
    'addresses' => 0.13,
    'through' => 0.16,
    'internet' => 0.36,
    'included' => 0.06,
    'submitted' => 0.14,
    'partner' => 0.22,
    'provider' => 0.17,
    'companies' => 0.48,
    'have' => 0.21,
    'cash' => 0.54,
    'eight' => 0.35,
    'hundred' => 0.58,
    'thousand' => 0.63,
    'euros' => 0.52,
    'only' => 0.25,
    'approved' => 0.27,
    'payout' => 0.49,
    'renumerated' => 0.68,
    'directly' => 0.16,
    'official' => 0.27,
    'payment' => 0.30,
    'agency' => 0.39,
    'board' => 0.13,
    'process' => 0.20,
    'unit' => 0.19,
    'render' => 0.29,
    'complete' => 0.23,
    'assistance' => 0.20,
    'provide' => 0.13,
    'additional' => 0.11,
    'information' => 0.25,
    'processes' => 0.13,
    'claims' => 0.12,
    'consultation' => 0.07,
    'validate' => 0.70,
    'full' => 0.36,
    'address' => 0.28,
    'mobile' => 0.32,
    'phone' => 0.27,
    'date' => 0.41,
    'birth' => 0.47,
    'occupation' => 0.41,
    'marital' => 0.36,
    'status' => 0.32,
    'enter' => 0.55,
    'message' => 0.28,
    'shown' => 0.44,
    'choose' => 0.38,
    'mode' => 0.28,
    'received' => 0.16,
    'winning' => 0.49,
    'scanned' => 0.19,
    'copy' => 0.29,
    'passport' => 0.43,
    'photograph' => 0.07,
    'proof' => 0.23,
    'reference' => 0.11,
    'keyfrmadesp' => 0.68,
    'batch' => 0.42,
    'eumlserial' => 0.50,
    'pmsq' => 0.68,
    'ticket' => 0.26,
    'nmky' => 0.68,
    'nkky' => 0.68,
    'assured' => 0.12,
    'utmost' => 0.10,
    'standards' => 0.11,
    'confidentiality' => 0.15,
    'press' => 0.19,
    'anonymity' => 0.54,
    'until' => 0.22,
    'proceedings' => 0.07,
    'yours' => 0.31,
    'faithfully' => 0.41,
    'gomez' => 0.56,
    'zonal' => 0.69,
    'coordinator' => 0.09,
    'claim' => 0.47,
    'agent' => 0.41,
    'officer' => 0.22,
);

/**
 * Combine per-word probabilities into one overall probability.
 *
 * Mathematically this is P / (P + Q), where P is the product of all scores
 * and Q is the product of all (1 - score). It is evaluated in the log domain
 * as 1 / (1 + e^eta) with eta = sum(ln(1 - s) - ln(s)), because the raw
 * products shrink geometrically with the number of words and eventually
 * underflow to 0, which turns the naive formula into 0 / 0.
 *
 * @param array $scores Map of word => probability, each within [0, 1].
 *
 * @return float Combined probability in [0, 1]. An empty map yields 0.5.
 *               If the map contains both a score of exactly 0 and a score of
 *               exactly 1 the evidence is contradictory and 0.5 is returned.
 *
 * @throws InvalidArgumentException If a score is not a number within [0, 1].
 */
function combineWordProbabilities(array $scores)
{
    $eta = 0.0;

    foreach ($scores as $word => $score) {
        if (!is_numeric($score) || $score < 0 || $score > 1) {
            throw new InvalidArgumentException(
                sprintf("Score for '%s' must be a number between 0 and 1.", $word)
            );
        }

        // log(0) evaluates to -INF, so a score of exactly 0 or 1 drives eta
        // to +INF or -INF and the result to exactly 0 or 1, as the plain
        // product formula would.
        $eta += log(1 - $score) - log($score);
    }

    // +INF plus -INF is NAN: at least one score was 0 and another was 1.
    // The plain formula is 0 / 0 in that case; report "no information".
    if (is_nan($eta)) {
        return 0.5;
    }

    return 1.0 / (1.0 + exp($eta));
}

// Combine the word probabilities
$probability = combineWordProbabilities($words);

// The same value expressed as a percentage (0 to 100).
$percentage = 100 * $probability;
Probability form is spam = float(1.4948061676356E-46)
我的计算是否正确?概率的小数点后真的应该有这么多个零吗?如何将这个数字转换为该表单用于网络钓鱼的百分比?
谢谢