我正在使用 Sphider 为共享网络驱动器搭建全文搜索功能,已经接近完成,但有一个问题让我很困惑。我对 txt、pdf、xls 等文件的内容建立索引,却发现 MySQL 数据库中存入了一些形如 lambda_[number] 的词,或者它与其他单词的某些组合,但其中总是包含 lambda 这个词。这看起来像是某些不可打印字符造成的,或者是某种捣乱的乱码。
所以我决定通过 utf8_encode(preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $fileOutput));
来消除这个麻烦,但没有成功。我的数据库排序规则是 utf8_general_ci。有什么办法可以过滤掉这些烦人的字符吗?
下面附上完整的文件解析器代码,以便更好地理解我在做什么以及所有的解析函数;不过问题很可能出在字符转义上。
/**
 * Read a plain-text file and hand back its raw bytes.
 *
 * @param string $filePath Path of the file to read.
 * @return string|false The file contents, or false when file_get_contents() fails.
 */
function readTextFile($filePath)
{
    return file_get_contents($filePath);
}
// -------------- START PARSE MS OFFICE FILES BLOCK
/**
 * Extract plain text from a legacy binary PowerPoint (.ppt) file.
 *
 * Heuristic: the raw bytes are split on 0x0F. A chunk whose bytes 1..3 are
 * NUL (i.e. the sequence 0x0F, <one byte>, 0x00 0x00 0x00 in the file) is
 * treated as the start of a text record; its text runs from offset 4 of the
 * chunk up to the next NUL byte. Chunks without a terminating NUL are skipped.
 *
 * @param string $filename Path of the .ppt file.
 * @return string Extracted text, one record per line; "" when the file is
 *                missing, unreadable or empty.
 */
function parsePPT($filename)
{
    if (!is_file($filename) || !is_readable($filename)) {
        return '';
    }

    // file_get_contents() reads in binary mode and releases its own handle,
    // so nothing is leaked and an empty file is simply an empty string.
    $raw = file_get_contents($filename);
    if ($raw === false || $raw === '') {
        return '';
    }

    $outtext = '';
    foreach (explode("\x0f", $raw) as $chunk) {
        // The three NULs must start at offset 1 (right after the record byte).
        if (strpos($chunk, "\x00\x00\x00") !== 1) {
            continue;
        }

        $text_line = substr($chunk, 4);
        $end_pos = strpos($text_line, "\x00");
        if ($end_pos === false) {
            // No terminator: not a complete text record.
            continue;
        }

        $text_line = substr($text_line, 0, $end_pos);
        // Keep only a conservative ASCII whitelist; everything else is binary noise.
        $text_line = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/", "", $text_line);
        if (strlen($text_line) > 1) {
            $outtext .= $text_line . "\n";
        }
    }

    return $outtext;
}
/**
 * Pull the text out of a PowerPoint .pptx archive.
 *
 * Slides are read in order (ppt/slides/slide1.xml, slide2.xml, ...) until the
 * first missing number. A space is inserted before every closing </a:t> so
 * neighbouring text runs stay separated once the markup is stripped.
 *
 * @param string $filename Path of the .pptx file.
 * @return string Tag-stripped slide text, or "" when the archive cannot be opened.
 */
function pptx2text($filename)
{
    $archive = new ZipArchive;
    if ($archive->open($filename) !== true) {
        return "";
    }

    $markup = "";
    for ($slide = 1; ; $slide++) {
        $entry = $archive->locateName("ppt/slides/slide$slide.xml");
        if ($entry === false) {
            break;
        }
        $markup .= str_replace("</a:t>", " </a:t>", $archive->getFromIndex($entry));
    }
    $archive->close();

    return strip_tags($markup);
}
/**
 * Extract the text of an Excel .xlsx workbook.
 *
 * Worksheets are read in order (xl/worksheets/sheet1.xml, sheet2.xml, ...)
 * until the first missing number, followed by xl/sharedStrings.xml when it
 * exists. String cells store only an index into the shared-strings part, so
 * without that part the actual cell text would never reach the index.
 *
 * A space is inserted before the closing tag of every value-carrying element
 * (<v>, <t>, <f>) so adjacent cells are not glued together once the markup
 * is stripped.
 *
 * @param string $filename Path of the .xlsx file.
 * @return string Tag-stripped workbook text, or "" when the archive cannot be opened.
 */
function xlsx2text($filename)
{
    $zip = new ZipArchive;
    if (true !== $zip->open($filename)) {
        return "";
    }

    // "</a:t>" is kept from the previous implementation for backward compatibility.
    $closingTags = array("</a:t>", "</v>", "</t>", "</f>");
    $spacedTags = array(" </a:t>", " </v>", " </t>", " </f>");

    $output = "";
    $sheet = 1;
    while (($index = $zip->locateName("xl/worksheets/sheet$sheet.xml")) !== false) {
        $output .= str_replace($closingTags, $spacedTags, (string) $zip->getFromIndex($index));
        $sheet++;
    }

    // The text of string cells lives here rather than in the sheets themselves.
    $index = $zip->locateName("xl/sharedStrings.xml");
    if ($index !== false) {
        $output .= str_replace($closingTags, $spacedTags, (string) $zip->getFromIndex($index));
    }

    $zip->close();

    return strip_tags($output);
}
/**
 * Extract the text of a Word .docx file.
 *
 * Delegates to readZippedXML() with the archive entry "word/document.xml".
 *
 * @param string $filename Path of the .docx file.
 * @return string Whatever readZippedXML() returns for that entry.
 */
function docx2text($filename)
{
    $documentPart = "word/document.xml";

    return readZippedXML($filename, $documentPart);
}
/**
 * Read one XML entry from a ZIP archive and return its text without markup.
 *
 * A space is inserted between adjacent closing tags ("></" becomes "> </") so
 * that neighbouring elements do not run together after the tags are stripped.
 *
 * The XML is parsed WITHOUT LIBXML_NOENT and with LIBXML_NONET: documents on
 * a shared drive are untrusted, and entity substitution would let a crafted
 * file pull local files or network resources into the index (XXE).
 *
 * @param string $archiveFile Path of the ZIP-based document.
 * @param string $dataFile    Name of the XML entry inside the archive.
 * @return string Tag-stripped text, or "" when the archive cannot be opened,
 *                the entry is missing or empty, or the XML does not parse.
 */
function readZippedXML($archiveFile, $dataFile)
{
    $zip = new ZipArchive;
    if (true !== $zip->open($archiveFile)) {
        return "";
    }

    $index = $zip->locateName($dataFile);
    $data = ($index !== false) ? $zip->getFromIndex($index) : false;
    $zip->close();

    // An empty string must not reach loadXML(): PHP 8 rejects empty input.
    if ($data === false || $data === "") {
        return "";
    }

    $data = str_replace("></", "> </", $data);

    $xml = new DOMDocument();
    $loaded = $xml->loadXML($data, LIBXML_NONET | LIBXML_NOERROR | LIBXML_NOWARNING);
    if ($loaded === false) {
        return "";
    }

    // Serialise and strip the markup, leaving only the text nodes.
    return strip_tags($xml->saveXML());
}
/**
 * Extract the text of a PDF file using the PDF2Text class from tools/pdf2text.php.
 *
 * @param string $fileName Path of the PDF file.
 * @return mixed Whatever PDF2Text::output() returns after decodePDF()
 *               (presumably the decoded text; confirm against tools/pdf2text.php).
 */
function parsePDF($fileName)
{
    // require_once: a plain require would re-include the file on a second call
    // and abort with a fatal "cannot redeclare class" error.
    require_once('tools/pdf2text.php');

    $pdfClass = new PDF2Text();
    $pdfClass->setFilename($fileName);
    $pdfClass->decodePDF();

    return $pdfClass->output();
}
// -------------- END PARSE MS OFFICE FILES BLOCK
/**
 * Normalise extracted text to clean UTF-8 for the indexer.
 *
 * Text that is not valid UTF-8 is assumed to be ISO-8859-1 (the same
 * assumption utf8_encode() makes) and converted. Afterwards only control
 * characters are removed; tab, LF and CR are kept so that words on
 * neighbouring lines are not glued together, and all printable non-ASCII
 * characters survive.
 *
 * @param mixed $text Raw text; non-strings are cast (false becomes "").
 * @return string Valid UTF-8 without C0/C1 control characters.
 */
function cleanIndexText($text)
{
    $text = (string) $text;

    // With the /u modifier PCRE refuses malformed UTF-8, so an empty pattern
    // doubles as a validity check.
    if (preg_match('//u', $text) !== 1) {
        $converted = iconv('ISO-8859-1', 'UTF-8', $text);
        if ($converted !== false) {
            $text = $converted;
        }
    }

    $cleaned = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]/u', '', $text);

    // preg_replace() yields null if the subject is still not valid UTF-8.
    return $cleaned === null ? '' : $cleaned;
}

// Request parameters: the document type and its location on disk.
$fileType = filter_input(INPUT_GET, 'fileType');
$filePath = filter_input(INPUT_GET, 'filePath');

// filter_input() yields null (parameter missing) or false (filter failure);
// bail out the same way an unsupported file type does.
if (!is_string($fileType) || !is_string($filePath) || $filePath === '') {
    return false;
}

// SECURITY NOTE(review): $filePath comes straight from the query string and is
// opened as-is by the parsers below. Restrict it to the indexed share (e.g. a
// realpath() prefix check) before exposing this script beyond the crawler.

// NOTE(review): the next three lines look like debugging residue (nothing in
// this script reads them); kept in case an including script relies on them.
$serverUri = $_SERVER['REQUEST_URI'];
$_SERVER['REQUEST_URI'] = "testval";
$secondUri = $_SERVER['REQUEST_URI'];

$fileTitle = trim(str_replace("\\", " ", $filePath));

switch ($fileType) {
    case 'txt':
        $fileBody = readTextFile($filePath);
        break;
    case 'pptx':
        $fileBody = pptx2text($filePath);
        break;
    case 'docx':
        $fileBody = docx2text($filePath);
        break;
    case 'xlsx':
        $fileBody = xlsx2text($filePath);
        break;
    case 'ppt':
        $fileBody = parsePPT($filePath);
        break;
    case 'pdf':
        $fileBody = parsePDF($filePath);
        break;
    default:
        return false;
}

// The title is request-controlled, so it is HTML-escaped after normalisation.
// Title and body are normalised separately because they may arrive in
// different encodings.
$fileOutput = "<html><head><title>"
    . htmlspecialchars(cleanIndexText($fileTitle), ENT_QUOTES, 'UTF-8')
    . "</title></head><body>"
    . cleanIndexText($fileBody)
    . "</body></html>";

echo $fileOutput;