优化解析文件系统的脚本

时间:2014-08-05 14:16:37

标签: php optimization

我使用一个PHP脚本遍历整个文件系统,打开其中的docx、xlsx、rtf等文件,并在每个文件的内容中搜索与正则表达式匹配的文本。但由于文件系统很大,这显然会阻塞浏览器,通常最终导致页面无响应。

除了减少要解析的文件数量之外,还有哪些方法可以优化这个脚本,同时又不增加用户的任何工作量?

以下是完整的脚本:

<?php
// Lift PHP's resource limits for this long-running scan: no memory cap
// (avoids the "allowed memory size exhausted" fatal error) and no execution
// time cap (avoids the "Maximum execution time of 30 seconds exceeded" fatal error).
ini_set('memory_limit', '-1');
set_time_limit(0);
// Report every error and print it into the page while debugging.
error_reporting(E_ALL);
ini_set('display_errors', 1);

// Project helper files, each resolved relative to the web server's document root.
// NOTE(review): presumably these provide $dbc/dbQuery() and the
// initialise_excel_reader()/read_file_doc()/read_file_docx()/rtf2text() readers
// used further down — confirm against the included files.
foreach (
    array(
        '/mysql.inc.php',
        '/read_excel_function.php',
        '/ReadWord.php',
        '/rtf_library/index.php',
    ) as $helperFile
) {
    require_once $_SERVER['DOCUMENT_ROOT'] . $helperFile;
}

// Root paths to scan. Only entries outside the /* ... */ blocks are active;
// move a line out of a comment block to include that root in the scan.
$filenames = array(
    /*
    "/Users/jMac-NEW/.bash_history",
    "/Users/jMac-NEW/.bvceuxan",
    "/Users/jMac-NEW/.CFUserTextEncoding",
    "/Users/jMac-NEW/.collab",
    "/Users/jMac-NEW/.DS_Store",
    "/Users/jMac-NEW/.filezilla",
    "/Users/jMac-NEW/.oxaqjmnrc",
    "/Users/jMac-NEW/.Trash",
    "/Users/jMac-NEW/00008.MTS",
    "/Users/jMac-NEW/00009.MTS",
    "/Users/jMac-NEW/00010.MTS",
    "/Users/jMac-NEW/00011.MTS",
    "/Users/jMac-NEW/00012.MTS",
    "/Users/jMac-NEW/00013.MTS",
    "/Users/jMac-NEW/00014.MTS",
    "/Users/jMac-NEW/366_Contemporary Political Philosophy An introduction.djvu",
    "/Users/jMac-NEW/626_Principles of Medical Imaging",
    "/Users/jMac-NEW/647",
    "/Users/jMac-NEW/Applications",
    "/Users/jMac-NEW/BANKING STATEMENTS",
    "/Users/jMac-NEW/Blackboard",
    "/Users/jMac-NEW/Books",
    "/Users/jMac-NEW/Clip #23.mov",
    "/Users/jMac-NEW/dates_from_content (1).csv",
    "/Users/jMac-NEW/dates_from_content (2).csv",
    "/Users/jMac-NEW/dates_from_content (3).csv",
    "/Users/jMac-NEW/dates_from_content (4).csv",
    "/Users/jMac-NEW/dates_from_content(5).csv",
    "/Users/jMac-NEW/dates_from_content(6).csv",
    "/Users/jMac-NEW/dates_from_content.csv",
    */
    "/Users/jMac-NEW/Desktop",
    /*
    "/Users/jMac-NEW/Documents",
    "/Users/jMac-NEW/Downloads",
    "/Users/jMac-NEW/for education policy analysis on HASS in S'pore",
    "/Users/jMac-NEW/Library",
    "/Users/jMac-NEW/Movies",
    "/Users/jMac-NEW/Music",
    "/Users/jMac-NEW/Pictures",
    "/Users/jMac-NEW/Public",
    "/Users/jMac-NEW/RES ppts",
    "/Users/jMac-NEW/TEMP for BALI TRIP 2013 - after can delete",
    "/Users/jMac-NEW/untitled folder",
    "/Users/jMac-NEW/Word_trial",
    "/Users/jMac-NEW/Word_trial.zip",
    "/Users/jMac-NEW/zzDEPRECATED",
    "/Users/jMac-NEW/zzITEMS ORIGINALLY IN ROOT",
    "/Users/jMac-NEW/zzTRANSFER to SCREEN CAPTURE 13",
    "/Users/jMac-NEW/zzzIMPORT",
    */
);

// Scan window: processing starts at $startFile and stops after $endFile.
// Both must be actual files, not directories, because they are compared
// against the pathname of each file the scan visits.
$startFile = "/Users/jMac-NEW/Desktop/Thx Craig.docx";
$endFile = "/Users/jMac-NEW/Desktop/Transaction status | EasyPay.pdf";
/*
$startFile = "/Users/jMac-NEW/Downloads/Theater_10_Paper_1_2014.docx";
$endFile = "/Users/jMac-NEW/Downloads/Workbook7.xlsx";
*/
//$startFile = "/Users/jMac-NEW/Downloads/LOST ITEM notice/new-1.docx";
//$endFile = "/Users/jMac-NEW/Downloads/zzTSI PROPOSALS and GUIDES/136/The Validity of.pdf";

// Paths to leave out of the scan. The scan loop skips every file whose
// pathname contains one of these strings, so a directory entry also excludes
// everything beneath it.
$exceptionFiles = array(
    "/Users/jMac-NEW/Desktop/HISTORY",
    "/Users/jMac-NEW/Desktop/HISTORY - research on HISTORY",
    "/Users/jMac-NEW/Desktop/IT MANUALS/from WD harddrive My Passport Ultra",
    "/Users/jMac-NEW/Library",
    "/Users/jMac-NEW/TEMP for BALI TRIP 2013 - after can delete",
    "/Users/jMac-NEW/WHATSAPP",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content (version 1) (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/FINAL COMPILATION - latest under fmp.fmp12",
    "/Users/jMac-NEW/zzDEPRECATED/humansclinic Google History.xls",
    "/Users/jMac-NEW/zzDEPRECATED/humansclinic Google History2 (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/humansclinic Google History2.xls",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1) (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1) (version 1) (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/TRAVERSE FOR DATES",
    "/Users/jMac-NEW/zzDEPRECATED/Workbook9 (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/zQUESTION DATABASE - latest latest",
    "/Users/jMac-NEW/zzDEPRECATED/zQUESTION DATABASE - latest latest latest",
    "/Users/jMac-NEW/zzDEPRECATED/zQUESTION DATABASE - latest latest latest (version 1)",
    "/Users/jMac-NEW/zzPDF",
    // NOTE(review): this entry contains a literal backslash ("\h" is not an
    // escape sequence in a double-quoted PHP string). Kept verbatim — confirm
    // whether a file with that exact name exists or "/" was intended.
    "/Users/jMac-NEW/zzPDF\history.xlsx",
    "/Users/jMac-NEW/zzTRANSFER to SCREEN CAPTURE 13",
    "/Users/jMac-NEW/Downloads/WD",
    "/Users/jMac-NEW/Downloads/TYPING",
);

// Debug dump of the effective configuration.
d($filenames, $startFile, $endFile, $exceptionFiles);

// Bookkeeping filled in (or left empty) by the scan loop.
$tooLargeFiles = array(); // paths skipped for exceeding the size limit
$problemFiles = array();
// Scan-window state: 0 = before $startFile, 1 = inside the window, 2 = $endFile reached.
$counter = 0;

//testing only
$files = array();

// Walks every root in $filenames recursively and, for each file that lies
// inside the [$startFile, $endFile] window (in iteration order), is not
// excluded and is not too large, extracts its text and stores every date-like
// match through dbQueryWithExt().

// Case-sensitive filename suffix => array(text-extraction function, label
// handed to dbQueryWithExt() as its $ext argument). No suffix here is a suffix
// of another one, so at most one entry matches a given path.
$readersBySuffix = array(
    ".xls"  => array('initialise_excel_reader', ".xls"),
    ".xlsx" => array('initialise_excel_reader', ".xlsx"),
    ".doc"  => array('read_file_doc', ".doc"),
    ".docx" => array('read_file_docx', ".docx"),
    ".txt"  => array('file_get_contents', ".txt | .csv | .tsv"),
    ".csv"  => array('file_get_contents', ".txt | .csv | .tsv"),
    ".tsv"  => array('file_get_contents', ".txt | .csv | .tsv"),
    ".rtf"  => array('rtf2text', ".rtf"),
);
// Day, then a 3-9 character word (letters or commas), then a 2-4 digit number,
// each optionally separated by one non-word character, e.g. "5 August 2014".
$datePattern = '/\d{1,2}\W?[a-z\,]{3,9}\W?\d{2,4}/i';
// Files larger than this many bytes are recorded in $tooLargeFiles and not parsed.
$maxFileSize = 36000000;

foreach ($filenames as $filename) {
    d($filename);
    if (!is_dir($filename)) continue;
    $it = new RecursiveDirectoryIterator($filename, RecursiveDirectoryIterator::SKIP_DOTS);
    d($it);
    foreach (new RecursiveIteratorIterator($it) as $file) {
        // (0) get path. getPathname is a method; reading it as a property
        // (as before) only ever collected nulls. The list is no longer dumped
        // on every iteration, because that output grew quadratically.
        $path = $file->getPathname();
        $files[] = $path;

        // (1) skip the file if its path contains any entry of $exceptionFiles
        // (so an excluded directory also excludes everything beneath it)
        foreach ($exceptionFiles as $exceptionFile) {
            $isExcluded = strpos($path, $exceptionFile) !== FALSE;
            d($isExcluded);
            if ($isExcluded) {
                d($path, $isExcluded);
                continue 2;
            }
        }

        // (2.1) the scan window opens at $startFile; without one nothing can run
        if (!$startFile) {
            echo "No start file defined.";
            exit;
        }
        if ($startFile === $path) {
            // $counter stays 0 before $startFile and becomes 2 once $endFile was handled
            $counter = 1;
        }
        d($counter);
        // (2.2) outside the window: nothing to do for this file
        if ($counter !== 1) continue;

        // (3.1) obtain file size; null means it could not be determined, so a
        // value left over from the previous file is never reused
        $fileSize = null;
        try {
            $fileSize = $file->getSize();
        }
        catch (Exception $e) {
            echo 'Caught exception: ',  $e->getMessage(), "\n";
        }
        d($path, $fileSize);

        $sizeKnown = ($fileSize !== null);
        if ($sizeKnown && $fileSize > $maxFileSize) {
            // (3.2) too big: remember it and do not parse
            $tooLargeFiles[] = $path;
            d($tooLargeFiles);
        }
        elseif ($sizeKnown) {
            // (4) after all the filtering from (2) and (3), parse the file for dates
            d($counter == 1);
            foreach ($readersBySuffix as $suffix => $reader) {
                if (!endsWith($path, $suffix)) continue;

                list($readerFunction, $extLabel) = $reader;
                $subject = call_user_func($readerFunction, $file);
                // A reader that produced no text (null, or false from
                // file_get_contents) yields nothing to search.
                if ($subject === null || $subject === false) break;

                $matchCount = preg_match_all($datePattern, $subject, $regs);
                if ($matchCount) {
                    foreach ($regs[0] as $match) {
                        dbQueryWithExt($match, $path, $filename, $extLabel);
                        echo $match . "|";
                    }
                    // Print the path only when THIS file produced matches; the
                    // old shared $check flag could carry over from an earlier file.
                    echo $path . "<br>";
                }
                break;
            }
        }

        // (5) close the window once $endFile has been handled. This check is
        // reached for every in-window file, including ones that were too large
        // or unreadable, so the window cannot stay open by accident.
        d($endFile === $path);
        if ($endFile === $path) {
            $counter = 2;
        }
    }
}


// testing only
// d($files);

/**
 * Reports whether $string ends with the suffix $test.
 *
 * The comparison is byte-wise and case-sensitive. Both arguments are cast to
 * string first, so objects implementing __toString() (e.g. SplFileInfo) are
 * accepted explicitly rather than through implicit coercion.
 *
 * @param mixed $string Value whose ending is inspected.
 * @param mixed $test   Suffix to look for.
 * @return bool True if $string ends with $test; an empty $test always matches.
 */
function endsWith($string, $test) {
    $string = (string) $string;
    $test = (string) $test;
    $testlen = strlen($test);
    // Every string ends with the empty string. Without this guard the offset
    // below would be -0 === 0 and the whole string would be compared to "".
    if ($testlen === 0)
        return true;
    if ($testlen > strlen($string))
        return false;
    // A negative offset makes substr_compare start $testlen bytes from the end.
    return substr_compare($string, $test, -$testlen) === 0;
}

/**
 * Inserts one found date string into the dates_from_content table.
 *
 * Each argument is escaped with mysqli_real_escape_string() on the global
 * $dbc connection and written to the column noted below; the finished
 * statement is dumped with d() and then executed through dbQuery().
 *
 * @param mixed $match    Matched date text (column `date`).
 * @param mixed $file     Path of the file the match came from (column `path`).
 * @param mixed $filename Root path the scan started from (column `search_in`).
 * @param mixed $ext      Extension label of the file type (column `search_exists`).
 */
function dbQueryWithExt ($match, $file, $filename, $ext) {
    global $dbc;

    $safeDate = mysqli_real_escape_string($dbc, $match);
    $safePath = mysqli_real_escape_string($dbc, $file);
    $safeRoot = mysqli_real_escape_string($dbc, $filename);
    $safeExt = mysqli_real_escape_string($dbc, $ext);

    $QueryString = "INSERT INTO dates_from_content (date, path, search_in, search_exists) VALUES ('$safeDate', '$safePath', '$safeRoot', '$safeExt')";
    d($QueryString);
    dbQuery($QueryString);
}


/* extend the function so that can add more extensions
  function compareEndsWith($file, $extensions)
  if (!is_array($extensions)) return false;


 */
?>

一些说明: (1)函数d()是Kint库提供的调试函数,作用类似var_dump(),但会输出格式化的结果。我只是用它来进行调试。

1 个答案:

答案 0 :(得分:0)

要考虑的选项:

  • 使用 /dev/shm 这样的内存盘(memdrive)来存放文件,以便在其中进行搜索
  • 缓存从文件中提取出的“文本结果”,而不是每次都用库重新解析文件;使用filemtime函数判断文件相对于缓存是否发生了更改,仅在文件发生更改时才清除对应的缓存
  • 为每个关键字缓存“可直接显示的结果”,并结合所有文件修改时间的总和作为缓存标记(cache tag)
  • 当搜索关键字包含某个已缓存的关键字时,使用该已缓存关键字对应的文件列表,从而大大缩小需要搜索的文件范围。我认为这是最有用的提示。(在缓存行中存储文件名、“文本结果”以及搜索词。)这需要一个类似SQL(带快速索引)的缓存库。