我正在尝试在PHP中实现Naive Bayes classifier,我找到了this script。
我在标准的LAMP堆栈(php-fpm)上运行脚本,我收到一个错误:
致命错误:调用未定义的函数3184791920() 第73行/location/file.php
但由于没有3184791920()
功能,我无法弄清楚导致这种情况的原因。我认为它与散列有关:
define("LL_NB_HASH_FUNCTION", "crc32");
以下是我的实施全文:
<?php
global $LL_NB_STOP_WORDS;
$LL_NB_STOP_WORDS = array("a", "about", "above", "...");
define("LL_NB_HASH_FUNCTION", "crc32");// crc32 is the fastest built in hash function.
// $xs is a bunch of "strings" and ys are their labels.
function ll_naivebayes($xs, $ys, $testStrings) {
$topicWords = array();
foreach($xs as $i=>$x) {
if(isset($topicWords[$ys[$i]]))
$topicWords[$ys[$i]] .= $x;
else
$topicWords[$ys[$i]] = $x;
}
$topicWords = _ll_computeWordCounts($topicWords); // get the number of each word, by topic.
$probWordsGivenTopic = array(); // probability of each word in a given topic.
$countTopics = array();
foreach($topicWords as $topicIndex=>$xWordCounts) {
$totalWordsTopic = array_sum($xWordCounts);
$countTopics[$topicIndex] = $total_wordsTopic;
foreach($xCount as $hash=>$count) {
$probWordsGivenTopic[$topicIndex][$hash] = ($count/$totalWordsTopic);
}
}
$probTopics = array(); // probability of a given topic (number of words / total words), i.e., relative frequency of topics in terms of words
foreach($countTopics as $i=>$topicCount) {
$probTopics[$i] = ($topicCount/$totalWords);
}
if(!is_array($testStrings))
$testStrings = array($testStrings);
// process the input testStrings array
$return = array();
foreach($testStrings as $i=>$string) {
$testStringWords = _ll_computeWordCount($string);
$topicsPosterior = array();
foreach($probTopics as $key=>$probTopic) {
$p = $probTopic;
foreach($testStringWords as $hash=>$count) {
if(isset($probWordsGivenTopic[$key][$hash]))
$p *= $probWordsGivenTopic[$key][$hash] * $count;
}
$topicsPosterior[$key] = $p;
}
sort($topicsPosterior);
$return[$i] = $topicsPosterior;
}
return $return;
}
function _ll_computeWordCounts($strings) {
$wcs = array();
foreach($strings as $string) {
$wcs[] = _ll_computeWordCount($string);
}
return $wcs;
}
function _ll_computeWordCount($string) {
$string = trim($string);
$string = explode(' ', $string);
natcasesort($string);
$hash = LL_NB_HASH_FUNCTION;
$words = array();
for($i=0, $count = count($string); $i<$count; $i++) {
$word = trim($string[$i]);
if(preg_match('/[^a-zA-Z\']/', $word))
continue;
$hash = (string) $hash($word);
if(!isset($words[$hash]))
$words[$hash] = 1; //$words[$hash] = array('word'=>$word, 'count'=>1);
else
$words[$hash]++; //$words[$hash]['count']++;
}
return $words;
}
$output = ll_naivebayes(array("Will I marry John", "Marriage is cool", "A string about Windows XP"), array("marriage", "marriage", "windows"), array("this is about marriage"));
?>
答案 0 :(得分:0)
看起来像一个bug看到我在代码中的评论
function _ll_computeWordCount($string) {
$string = trim($string);
$string = explode(' ', $string);
natcasesort($string);
$hash = LL_NB_HASH_FUNCTION; // $hash = crc32
$words = array();
for($i=0, $count = count($string); $i<$count; $i++) {
$word = trim($string[$i]);
if(preg_match('/[^a-zA-Z\']/', $word))
continue;
$hash = (string) $hash($word); // 1st iteration $hash = crc32($word)
//2nd iteration $hash = 2949202($word) - fatal error
if(!isset($words[$hash]))
$words[$hash] = 1; //$words[$hash] = array('word'=>$word, 'count'=>1);
else
$words[$hash]++; //$words[$hash]['count']++;
}
return $words;
}
试试这个
function _ll_computeWordCount($string) {
$string = trim($string);
$string = explode(' ', $string);
natcasesort($string);
$hash_function = LL_NB_HASH_FUNCTION;
$words = array();
for($i=0, $count = count($string); $i<$count; $i++) {
$word = trim($string[$i]);
if(preg_match('/[^a-zA-Z\']/', $word))
continue;
$hash = (string) $hash_function($word);
if(!isset($words[$hash]))
$words[$hash] = 1; //$words[$hash] = array('word'=>$word, 'count'=>1);
else
$words[$hash]++; //$words[$hash]['count']++;
}
return $words;
}