我已经设定了在非常有限的时间内在PHP中创建基本文本文件搜索引擎的挑战,几乎没有以前的编程知识这是一项非常艰巨的任务!
这是我们到目前为止所做的,它确实设法返回一个单词出现次数最多的文档((s) - 如果有多个相同数量)。
问题是我们做的方式不是(至少不容易)让我们计算TF-IDF得分。 IDF已经完成,但是我们需要通过获取返回文档中的单词总数来计算TF,这就是我们遇到的问题。另一个问题是它只返回最高的文件,而我们无法让它返回每个文件的清单......即一个文件中有3个单词的“航空公司”,另外两个文件有一次他们被遗忘了,只有第一个被归还......
(剥离符号也存在一些问题,但我们解决了这个问题,尽管采用了抽取方法......)
以下是我们的内容:
<?php
$starttime = microtime();
$startarray = explode(" ", $starttime);
$starttime = $startarray[1] + $startarray[0];
if(isset($_GET['search']))
{
$searchWord = $_GET['search'];
}
else
{
$searchWord = null;
}
?>
<html>
<link href="style.css" rel="stylesheet" type="text/css">
<body>
<div id="wrapper">
<div id="searchbar">
<h1>PHP Search</h1>
<form name='searchform' id='searchform' action='<?php echo $_SERVER['PHP_SELF']; ?>' method='get'>
<input type='text' name='search' id='search' value='<?php echo $_GET['search']; ?>' />
<input type='submit' value='Search' />
</form>
<br />
<br />
</div><!-- close searchbar -->
<?php
//path to directory to scan
$directory = "./files/";
//get all image files with a .txt extension.
$files = glob("" . $directory . "*.txt");
$fileList = array();
//print each file name
foreach($files as $file)
{
$fileList[] = $file;
}
//$fileList;
function indexFile($file){
$filename = $file;
$fp = fopen($filename, 'r');
$file_contents = fread($fp, filesize($filename));
fclose($fp);
$pat[0] = "/^\s+/";
$pat[1] = "/\s{2,}/";
$pat[2] = "/\s+\$/";
$rep[0] = "";
$rep[1] = " ";
$rep[2] = "";
$new_contents = preg_replace("/[^A-Za-z0-9\s\s+]/", "", $file_contents);
$new_contents = preg_replace($pat, $rep, $new_contents);
//COMMON WORDS WERE HERE
include "commonwords.php";
$lines = explode("\n", $new_contents);
$lines2 = implode(" ", $lines); //string
$lines2 = strtolower($lines2);
//echo $lines2 . "<br><br>";
$words = explode(" ", $lines2); //array
//$words = $lines;
$useful_words = array_diff($words, $commonWords);
$useful_words = array_values($useful_words);
print_r(count($useful_words));
//echo '<pre>';
$index = array_count_values($useful_words);
arsort($index, SORT_NUMERIC);
//print_r($index);
//echo '</pre>';
return $index;
}
// $file1 = indexFile ('airlines.txt'); //array
// $file2 = indexFile ('africa.txt'); //array
function merge_common_keys(){
$arr = func_get_args();
$num = func_num_args();
$keys = array();
$i = 0;
for($i=0;$i<$num;++$i){
$keys = array_merge($keys, array_keys($arr[$i]));
}
$keys = array_unique($keys);
$merged = array();
foreach($keys as $key){
$merged[$key] = array();
for($i=0;$i<$num;++$i){
$merged[$key][] = isset($arr[$i][$key])?$arr[$i][$key]:null;
}
}
return $merged;
}
for ($i = 0; $i < count($fileList); $i++) {
$fileArray[$i] = indexFile($fileList[$i]);
}
$merged = call_user_func_array('merge_common_keys',$fileArray);
$searchQ = $merged[$searchWord];
echo '<pre>';
print_r($searchQ);
echo '</pre>';
//echo "hello2";
$maxValue = 0;
$num_docs = 0;
$docID = array();
$n = count($searchQ);
for ($i=0 ; $i < $n ; $i++) {
if ($searchQ[$i] > $maxValue) {
$maxValue = $searchQ[$i];
unset($docID);
$docID[] = $i;
//print_r(count($fileArray[$i]));
}
else if($searchQ[$i] == $maxValue){
$docID[] = $i;
}
if (!empty($searchQ[$i])) {
$num_docs++;
}
}
print_r($n);
print_r($num_docs);
print_r($docID);
if(is_array($docID)){
for ($i = 0; $i < count($docID); $i++) {
if ($maxValue == 1){$plural = '';}else{$plural = 's';}
print_r ('<p><b>'.$searchWord . '</b> found in document <a href="'.$fileList[$docID[$i]].'">'.$fileList[$docID[$i]].'</a> '.$maxValue.' time'.$plural.'.</p>');
$TF = $maxValue;
//$TF2 = 1 + log($TF);
echo "<br>$TF2<br>";
$DF = $num_docs;
$Non = $n / $num_docs;
//echo "$Non";
$IDF = (float) log10($Non);
$TFxIDF = $TF2 * $IDF;
//echo "$TFxIDF";
}
}
//1,2
//file_put_contents("demo2.txt", implode(" ", $useful_words));
if(isset($_GET['search']))
{
$endtime = microtime();
$endarray = explode(" ", $endtime);
$endtime = $endarray[1] + $endarray[0];
$totaltime = $endtime - $starttime;
$totaltime = round($totaltime,5);
echo "<div id='timetaken'><p>This page loaded in $totaltime seconds.</p></div>";
}
?>
</div><!-- close wrapper -->
</body>
</html>