Sphider索引(pdf,xls,doc,...)mysql

时间:2016-01-18 09:24:51

标签: php mysql pdf special-characters

即时通讯使用sphider,在共享网络驱动器上创建某种全文搜索,即将完成,但我对此感到困惑。我索引txt,pdf,xls等文件内容,但我注意到,在mysql DB中我存储了一些名为 lambda_ [number] 的字符或者与其他单词的某些组合,但总是它的 lambda 字。这听起来像是一些不可打印的角色,或者对我来说有点咧嘴笑。

所以我决定通过utf8_encode(preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $fileOutput));消除这种痛苦,但没有成功。我的DB 排序规则是utf8_general_ci 。任何想法如何逃脱这个小巨魔?

和整个文件解析器代码,以便更好地理解我在做什么和所有解析功能,但问题可能是转义。

    function readTextFile($filePath) {
    $fileContent = file_get_contents($filePath);

    return $fileContent;
}

// -------------- START PARSE MS OFFICE FILES BLOCK 

function parsePPT($filename) {
// This approach uses detection of the string "chr(0f).Hex_value.chr(0x00).chr(0x00).chr(0x00)" to find text strings, which are then terminated by another NUL chr(0x00). [1] Get text between delimiters [2]
    $fileHandle = fopen($filename, "r");
    $line = @fread($fileHandle, filesize($filename));
    $lines = explode(chr(0x0f), $line);
    $outtext = '';

    foreach ($lines as $thisline) {
        if (strpos($thisline, chr(0x00) . chr(0x00) . chr(0x00)) == 1) {
            $text_line = substr($thisline, 4);
            $end_pos = strpos($text_line, chr(0x00));
            $text_line = substr($text_line, 0, $end_pos);
            $text_line = preg_replace("/[^a-zA-Z0-9\s\,\.\-\n\r\t@\/\_\(\)]/", "", $text_line);
            if (strlen($text_line) > 1) {
                $outtext.= substr($text_line, 0, $end_pos) . "\n";
            }
        }
    }
    return $outtext;
}

function pptx2text($filename) {
    $zip = new ZipArchive;

    // Open received archive file
    if (true === $zip->open($filename)) {

        // If done, search for the data file in the archive
        $dia = 1;
        $data = array();
        $output = "";

        while (($index = $zip->locateName("ppt/slides/slide$dia.xml") ) !== false) {
            $data[$dia] = $zip->getFromIndex($index);
            $xml = str_replace("</a:t>", " </a:t>", $data[$dia]);
            $output.=$xml;
            $dia++;
        }

        $zip->close();
        return strip_tags($output);
    } else {
        return "";
    }
}

function xlsx2text($filename) {
    $zip = new ZipArchive;
    // Open received archive file
    if (true === $zip->open($filename)) {

        // If done, search for the data file in the archive
        $dia = 1;
        $data = array();
        $output = "";

        while (($index = $zip->locateName("xl/worksheets/sheet$dia.xml") ) !== false) {
            $data[$dia] = $zip->getFromIndex($index);
            //$pageContent .= $data[$dia];
            $xml = str_replace("</a:t>", " </a:t>", $data[$dia]);
            $output.=$xml;
            $dia++;
        }

        $zip->close();
        return strip_tags($output);
    } else {
        return "";
    }
}

function docx2text($filename) {
    return readZippedXML($filename, "word/document.xml");
}

function readZippedXML($archiveFile, $dataFile) {
    // Create new ZIP archive  
    $zip = new ZipArchive;

    // Open received archive file  
    if (true === $zip->open($archiveFile)) {
        // If done, search for the data file in the archive  
        if (($index = $zip->locateName($dataFile)) !== false) {
            // If found, read it to the string  
            $data = $zip->getFromIndex($index);
            $data = str_replace("></", "> </", $data);
            // Close archive file
            $zip->close();
            // Load XML from a string  
            // Skip errors and warnings  
            $xml = new DOMDocument();
            $xml->loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            //$xml = DOMDocument::loadXML($data, LIBXML_NOENT | LIBXML_XINCLUDE | LIBXML_NOERROR | LIBXML_NOWARNING);
            // Return data without XML formatting tags
            return strip_tags($xml->saveXML());
            return $xml;
        }
        $zip->close();
    }
    // In case of failure return empty string  
    return "";
}



function parsePDF($fileName){
    require('tools/pdf2text.php');
    $pdfClass = new PDF2Text();
    $pdfClass->setFilename($fileName);
    $pdfClass->decodePDF();
    return $pdfClass->output();
}

// -------------- END PARSE MS OFFICE FILES BLOCK 


$fileType = filter_input(INPUT_GET, 'fileType');
$filePath = filter_input(INPUT_GET, 'filePath');

$serverUri = $_SERVER['REQUEST_URI'];
$_SERVER['REQUEST_URI'] = "testval";
$secondUri = $_SERVER['REQUEST_URI'];
$fileTitle = trim(str_replace("\\", " ",$filePath));

$fileOutput = "<html><head><title>".$fileTitle."</title></head><body>";

switch ($fileType) {
    case 'txt':
        $fileOutput .= readTextFile($filePath);
        break;
    case 'pptx':
        $fileOutput .= pptx2text($filePath);
        break;
    case 'docx':
        $fileOutput .= docx2text($filePath);
        break;
    case 'xlsx':
        $fileOutput .= xlsx2text($filePath);
        break;
    case 'ppt':
        $fileOutput .= parsePPT($filePath);
        break;
    case 'pdf':
        $fileOutput .= parsePDF($filePath);
        break;
    default:
        return false;
}
$fileOutput .= "</body>";
echo utf8_encode(preg_replace('/[\x00-\x1F\x80-\xFF]/', '', $fileOutput));

0 个答案:

没有答案