PHP中的朴素贝叶斯

时间:2014-05-10 04:45:49

标签: php machine-learning classification

我正在尝试在 PHP 中实现朴素贝叶斯分类器(Naive Bayes classifier),并找到了这个脚本。

我在标准的LAMP堆栈(php-fpm)上运行脚本,我收到一个错误:

  

致命错误:调用未定义的函数 3184791920(),位于 /location/file.php 的第 73 行

但代码中并没有名为 3184791920() 的函数,所以我无法弄清楚是什么导致了这个错误。我认为它与散列(哈希)有关:

// Name of the hash function, stored as a string; the script later calls it through a variable.
define("LL_NB_HASH_FUNCTION", "crc32");

以下是我的实施全文:

<?php

// Name of the function used to hash each word. The script's author chose
// crc32, describing it as the fastest built in hash function.
define("LL_NB_HASH_FUNCTION", "crc32");

// Stop-word list published as a global variable. It is not referenced by the
// code shown here.
global $LL_NB_STOP_WORDS;
$LL_NB_STOP_WORDS = array("a", "about", "above", "...");

// $xs is a bunch of "strings" and ys are their labels.
/**
 * Builds per-label word statistics from labelled strings and scores test strings.
 *
 * Training strings that share a label are joined into one text per label and
 * word counts are computed for each of them. Every test string then receives
 * one score per label: the label's share of all counted training words,
 * multiplied, for each test word known to that label, by
 * (word share within the label * occurrences of the word in the test string).
 *
 * @param array        $xs          Training strings.
 * @param array        $ys          Labels; $ys[$i] is the label of $xs[$i].
 * @param array|string $testStrings One string, or an array of strings, to score.
 * @return array For each key of $testStrings, the list of label scores sorted
 *               in ascending order. sort() reindexes the list, so the scores
 *               are not keyed by label.
 */
function ll_naivebayes($xs, $ys, $testStrings) {
   // Merge all training strings that share a label into one text per label.
   $topicWords = array();
   foreach($xs as $i=>$x) {
      if(isset($topicWords[$ys[$i]])) {
         // Join with a space so the last word of one string and the first
         // word of the next are not fused into a single token.
         $topicWords[$ys[$i]] .= ' ' . $x;
      } else {
         $topicWords[$ys[$i]] = $x;
      }
   }
   $topicWords = _ll_computeWordCounts($topicWords);   // word counts (hash => count), one entry per topic

   $probWordsGivenTopic = array();   // [topic][word hash] => share of that word among the topic's words
   $countTopics = array();           // [topic] => total number of counted words

   foreach($topicWords as $topicIndex=>$xWordCounts) {
      $totalWordsTopic = array_sum($xWordCounts);
      $countTopics[$topicIndex] = $totalWordsTopic;

      // Only runs for topics with at least one counted word, so the divisor is never zero.
      foreach($xWordCounts as $hash=>$count) {
         $probWordsGivenTopic[$topicIndex][$hash] = ($count/$totalWordsTopic);
      }
   }

   // Total number of counted words across all topics.
   $totalWords = array_sum($countTopics);

   // Relative frequency of each topic in terms of words (topic words / total words).
   $probTopics = array();
   foreach($countTopics as $i=>$topicCount) {
      // Guard against division by zero when no training word was counted at all.
      $probTopics[$i] = ($totalWords > 0) ? ($topicCount/$totalWords) : 0;
   }

   if(!is_array($testStrings))
      $testStrings = array($testStrings);

   // Score every test string against every topic.
   $return = array();
   foreach($testStrings as $i=>$string) {
      $testStringWords = _ll_computeWordCount($string);
      $topicsPosterior = array();

      foreach($probTopics as $key=>$probTopic) {
         $p = $probTopic;

         foreach($testStringWords as $hash=>$count) {
            // Words the topic has never seen are skipped rather than penalised.
            // NOTE(review): a multinomial model would raise the probability to
            // the power $count instead of multiplying by it; kept as written
            // so existing scores do not change - confirm the intent.
            if(isset($probWordsGivenTopic[$key][$hash]))
               $p *= $probWordsGivenTopic[$key][$hash] * $count;
         }
         $topicsPosterior[$key] = $p;
      }
      // NOTE(review): sort() discards the topic keys, so callers only see the
      // ordered scores, not which topic produced them.
      sort($topicsPosterior);
      $return[$i] = $topicsPosterior;
   }
   return $return;
}

/**
 * Computes the word-count map of every string in a collection.
 *
 * @param array $strings Strings to analyse.
 * @return array One word-count map per input string, in iteration order and
 *               reindexed from 0 (the input keys are not kept).
 */
function _ll_computeWordCounts($strings) {
   $countsPerString = array();
   foreach($strings as $text) {
      array_push($countsPerString, _ll_computeWordCount($text));
   }
   return $countsPerString;
}

/**
 * Counts the words of a space-separated string, keyed by a hash of each word.
 * Tokens containing anything other than ASCII letters or apostrophes are skipped.
 *
 * NOTE(review): as written, this fails with a fatal error on the second counted
 * word; see the comment on the hashing line inside the loop.
 */
function _ll_computeWordCount($string) {
      $string = trim($string);
      $string = explode(' ', $string);
      // natcasesort() keeps the original keys, and the loop below reads the
      // tokens by index, so they are still visited in their original order.
      natcasesort($string);
      // At this point $hash holds the NAME of the hash function to call.
      $hash = LL_NB_HASH_FUNCTION;

      $words = array();
      for($i=0, $count = count($string); $i<$count; $i++) {
         $word = trim($string[$i]);
         if(preg_match('/[^a-zA-Z\']/', $word))
            continue;

         // Defect: $hash is overwritten with the hash VALUE of the word. On
         // the next counted word, $hash($word) therefore tries to call a
         // function whose name is that number, which does not exist.
         $hash = (string) $hash($word);
         if(!isset($words[$hash]))
            $words[$hash] = 1; //$words[$hash] = array('word'=>$word, 'count'=>1);
         else
            $words[$hash]++; //$words[$hash]['count']++;
      }

      return $words;
}

// Example run: train on three labelled sentences, then score one test sentence.
$output = ll_naivebayes(
   array("Will I marry John", "Marriage is cool", "A string about Windows XP"),
   array("marriage", "marriage", "windows"),
   array("this is about marriage")
);

?>

1 个答案:

答案 0 :(得分:0)

这看起来是一个 bug,请看我在代码中添加的注释:

// Annotated copy of the failing function: the comments trace how $hash changes
// from a function name into a number.
function _ll_computeWordCount($string) {
  $string = trim($string);
  $string = explode(' ', $string);
  natcasesort($string);
  $hash = LL_NB_HASH_FUNCTION; // $hash now holds the function name "crc32"

  $words = array();
  for($i=0, $count = count($string); $i<$count; $i++) {
     $word = trim($string[$i]);
     if(preg_match('/[^a-zA-Z\']/', $word))
        continue;

     $hash = (string) $hash($word); // 1st counted word: calls crc32($word), then stores the numeric result in $hash
     // 2nd counted word: $hash is now a number, so this calls e.g. 2949202($word) - fatal error

     if(!isset($words[$hash]))
        $words[$hash] = 1; //$words[$hash] = array('word'=>$word, 'count'=>1);
     else
        $words[$hash]++; //$words[$hash]['count']++;
  }

  return $words;
}

试试下面这个修正后的版本:

/**
 * Counts the words of a string, keyed by the hash of each word.
 *
 * The string is split on single spaces. Tokens containing any character other
 * than ASCII letters or an apostrophe are ignored, and so are empty tokens
 * (produced by an empty string or by consecutive spaces).
 *
 * @param string $string Text to tokenize.
 * @return array Map of word hash => number of occurrences, in order of first
 *               appearance.
 */
function _ll_computeWordCount($string) {
  // Keep the function name in its own variable: it is called once per word
  // and must never be overwritten by a hash value.
  $hash_function = LL_NB_HASH_FUNCTION;

  $words = array();
  foreach(explode(' ', trim($string)) as $token) {
     $word = trim($token);
     // Skip empty tokens and tokens with a non-letter, non-apostrophe character.
     if($word === '' || preg_match('/[^a-zA-Z\']/', $word))
        continue;

     $hash = (string) $hash_function($word);

     if(!isset($words[$hash]))
        $words[$hash] = 1;
     else
        $words[$hash]++;
  }

  return $words;
}