使用PHP脚本遍历整个文件系统,查找docx、xlsx、rtf等文件,逐个打开,并在每个文件的内容中搜索与正则表达式匹配的文本。但由于文件系统很大,这显然会阻塞浏览器,通常最终导致浏览器无响应。
除了减少要解析的文件数量之外,还有哪些方法可以优化这个脚本,同时又不增加用户需要做的任何工作?
以下是完整的脚本:
<?php
// Report every notice, warning and error, and print them into the response.
error_reporting(E_ALL);
ini_set('display_errors', 1);
// The scan can run for a long time and hold a lot of data, so lift PHP's
// execution-time limit (default: fatal error after 30 seconds) and its memory
// cap (fatal "max memory exceeded" error).
set_time_limit(0);
ini_set('memory_limit', '-1');
// Project helpers, all resolved relative to the web server's document root:
// database access, the Excel reader, the Word reader and the RTF library.
require_once "{$_SERVER['DOCUMENT_ROOT']}/mysql.inc.php";
require_once "{$_SERVER['DOCUMENT_ROOT']}/read_excel_function.php";
require_once "{$_SERVER['DOCUMENT_ROOT']}/ReadWord.php";
require_once "{$_SERVER['DOCUMENT_ROOT']}/rtf_library/index.php";
// Root entries to scan. Only the Desktop is currently active; the other
// entries are kept as commented-out elements so they can be re-enabled by
// removing the leading "//". The scan loop skips entries that are not
// directories.
$filenames = array(
    // "/Users/jMac-NEW/.bash_history",
    // "/Users/jMac-NEW/.bvceuxan",
    // "/Users/jMac-NEW/.CFUserTextEncoding",
    // "/Users/jMac-NEW/.collab",
    // "/Users/jMac-NEW/.DS_Store",
    // "/Users/jMac-NEW/.filezilla",
    // "/Users/jMac-NEW/.oxaqjmnrc",
    // "/Users/jMac-NEW/.Trash",
    // "/Users/jMac-NEW/00008.MTS",
    // "/Users/jMac-NEW/00009.MTS",
    // "/Users/jMac-NEW/00010.MTS",
    // "/Users/jMac-NEW/00011.MTS",
    // "/Users/jMac-NEW/00012.MTS",
    // "/Users/jMac-NEW/00013.MTS",
    // "/Users/jMac-NEW/00014.MTS",
    // "/Users/jMac-NEW/366_Contemporary Political Philosophy An introduction.djvu",
    // "/Users/jMac-NEW/626_Principles of Medical Imaging",
    // "/Users/jMac-NEW/647",
    // "/Users/jMac-NEW/Applications",
    // "/Users/jMac-NEW/BANKING STATEMENTS",
    // "/Users/jMac-NEW/Blackboard",
    // "/Users/jMac-NEW/Books",
    // "/Users/jMac-NEW/Clip #23.mov",
    // "/Users/jMac-NEW/dates_from_content (1).csv",
    // "/Users/jMac-NEW/dates_from_content (2).csv",
    // "/Users/jMac-NEW/dates_from_content (3).csv",
    // "/Users/jMac-NEW/dates_from_content (4).csv",
    // "/Users/jMac-NEW/dates_from_content(5).csv",
    // "/Users/jMac-NEW/dates_from_content(6).csv",
    // "/Users/jMac-NEW/dates_from_content.csv",
    "/Users/jMac-NEW/Desktop",
    // "/Users/jMac-NEW/Documents",
    // "/Users/jMac-NEW/Downloads",
    // "/Users/jMac-NEW/for education policy analysis on HASS in S'pore",
    // "/Users/jMac-NEW/Library",
    // "/Users/jMac-NEW/Movies",
    // "/Users/jMac-NEW/Music",
    // "/Users/jMac-NEW/Pictures",
    // "/Users/jMac-NEW/Public",
    // "/Users/jMac-NEW/RES ppts",
    // "/Users/jMac-NEW/TEMP for BALI TRIP 2013 - after can delete",
    // "/Users/jMac-NEW/untitled folder",
    // "/Users/jMac-NEW/Word_trial",
    // "/Users/jMac-NEW/Word_trial.zip",
    // "/Users/jMac-NEW/zzDEPRECATED",
    // "/Users/jMac-NEW/zzITEMS ORIGINALLY IN ROOT",
    // "/Users/jMac-NEW/zzTRANSFER to SCREEN CAPTURE 13",
    // "/Users/jMac-NEW/zzzIMPORT",
);

// Window of files to process: work begins at $startFile and ends at $endFile.
// Both must name actual files, never directories, because they are compared
// with the full path of each file the scan visits.
$startFile = "/Users/jMac-NEW/Desktop/Thx Craig.docx";
$endFile = "/Users/jMac-NEW/Desktop/Transaction status | EasyPay.pdf";
// Previously used windows:
//   "/Users/jMac-NEW/Downloads/Theater_10_Paper_1_2014.docx"
//     -> "/Users/jMac-NEW/Downloads/Workbook7.xlsx"
//   "/Users/jMac-NEW/Downloads/LOST ITEM notice/new-1.docx"
//     -> "/Users/jMac-NEW/Downloads/zzTSI PROPOSALS and GUIDES/136/The Validity of.pdf"

// Paths to leave out of the scan. A visited file is skipped when any of these
// strings occurs anywhere inside its full path, so naming a directory excludes
// everything below it.
$exceptionFiles = array(
    "/Users/jMac-NEW/Desktop/HISTORY",
    "/Users/jMac-NEW/Desktop/HISTORY - research on HISTORY",
    "/Users/jMac-NEW/Desktop/IT MANUALS/from WD harddrive My Passport Ultra",
    "/Users/jMac-NEW/Library",
    "/Users/jMac-NEW/TEMP for BALI TRIP 2013 - after can delete",
    "/Users/jMac-NEW/WHATSAPP",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/COMPILATION - dates from content (version 1) (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/FINAL COMPILATION - latest under fmp.fmp12",
    "/Users/jMac-NEW/zzDEPRECATED/humansclinic Google History.xls",
    "/Users/jMac-NEW/zzDEPRECATED/humansclinic Google History2 (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/humansclinic Google History2.xls",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1) (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/SOURCING JOBS Breakdown (version 1) (version 1) (version 1) (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/TRAVERSE FOR DATES",
    "/Users/jMac-NEW/zzDEPRECATED/Workbook9 (version 1)",
    "/Users/jMac-NEW/zzDEPRECATED/zQUESTION DATABASE - latest latest",
    "/Users/jMac-NEW/zzDEPRECATED/zQUESTION DATABASE - latest latest latest",
    "/Users/jMac-NEW/zzDEPRECATED/zQUESTION DATABASE - latest latest latest (version 1)",
    "/Users/jMac-NEW/zzPDF",
    // NOTE(review): this entry contains a backslash rather than a slash; kept
    // exactly as written. Confirm whether "/" was intended.
    "/Users/jMac-NEW/zzPDF\history.xlsx",
    "/Users/jMac-NEW/zzTRANSFER to SCREEN CAPTURE 13",
    "/Users/jMac-NEW/Downloads/WD",
    "/Users/jMac-NEW/Downloads/TYPING",
);

// Paths of files that were skipped for exceeding the size limit.
$tooLargeFiles = array();

// Debug dump of the effective configuration.
d($filenames, $startFile, $endFile, $exceptionFiles);

$problemFiles = array();

// Position relative to the start/end window: 0 = before $startFile,
// 1 = inside the window, 2 = $endFile has been reached.
$counter = 0;
// Main scan: walk every configured root directory, and for each file inside
// the $startFile..$endFile window extract its text, search it for date-like
// strings and store every hit through dbQueryWithExt().

// Debug aid ("testing only"): the path of every file the iterator visits.
$files = array();

// Date-like text: 1-2 digits, an optional non-word character, 3-9 letters or
// commas, another optional non-word character, then 2-4 digits
// (for example "12 March 2013").
$datePattern = '/\d{1,2}\W?[a-z\,]{3,9}\W?\d{2,4}/i';

// Files above this size (in bytes) are not parsed, only listed in $tooLargeFiles.
$maxFileSize = 36000000;

// The start marker never changes during the scan, so validate it once up
// front instead of once per visited file.
if (!$startFile) {
    echo "No start file defined.";
    exit;
}

foreach ($filenames as $filename) {
    d($filename);
    if (!is_dir($filename)) continue;

    $it = new RecursiveDirectoryIterator($filename, RecursiveDirectoryIterator::SKIP_DOTS);
    d($it);

    // CATCH_GET_CHILD keeps one unreadable sub-directory from aborting the
    // whole traversal with an exception; such directories are simply skipped.
    $walker = new RecursiveIteratorIterator(
        $it,
        RecursiveIteratorIterator::LEAVES_ONLY,
        RecursiveIteratorIterator::CATCH_GET_CHILD
    );

    foreach ($walker as $file) {
        // (0) get path
        $path = $file->getPathname();
        // The list is no longer dumped on every iteration: printing an
        // ever-growing array per file made the debug output quadratic.
        $files[] = $path;

        // (1) skip the file when any exception path occurs inside its path,
        //     i.e. it is an excluded file or lives below an excluded directory
        foreach ($exceptionFiles as $exceptionFile) {
            $isExcluded = strpos($path, $exceptionFile) !== FALSE;
            d($isExcluded);
            if ($isExcluded) {
                d($path, $isExcluded);
                continue 2;
            }
        }

        // (2.1) entering the window: $counter stays 0 before $startFile,
        //       becomes 1 from $startFile on, and 2 once $endFile is reached
        if ($startFile === $path) {
            $counter = 1;
        }
        d($counter);

        // (2.2) not inside the window: nothing to do for this file
        if ($counter !== 1) continue;

        // (3.1) obtain file size; a file whose size cannot be read is
        //       recorded and skipped instead of reusing the previous file's size
        $fileSize = null;
        try {
            $fileSize = $file->getSize();
        }
        catch (Exception $e) {
            echo 'Caught exception: ', $e->getMessage(), "\n";
            $problemFiles[] = $path;
        }
        d($path, $fileSize);

        if ($fileSize === null) {
            // size unknown: skip parsing, but still fall through to step (5)
        }
        elseif ($fileSize > $maxFileSize) {
            // (3.2) too big: remember it and skip parsing
            $tooLargeFiles[] = $path;
            d($tooLargeFiles);
        }
        else {
            // (4) extract the text with the reader that matches the extension.
            //     The extensions are mutually exclusive, so at most one branch runs.
            d($counter == 1);
            $subject = null;
            $ext = null;
            if (endsWith($file, ".xls")) {
                $ext = ".xls";
                $subject = initialise_excel_reader($file);
            }
            elseif (endsWith($file, ".xlsx")) {
                $ext = ".xlsx";
                $subject = initialise_excel_reader($file);
            }
            elseif (endsWith($file, ".doc")) {
                $ext = ".doc";
                $subject = read_file_doc($file);
            }
            elseif (endsWith($file, ".docx")) {
                $ext = ".docx";
                $subject = read_file_docx($file);
            }
            elseif (endsWith($file, ".txt") || endsWith($file, ".csv") || endsWith($file, ".tsv")) {
                $ext = ".txt | .csv | .tsv";
                // The file contents are no longer dumped through d(): that
                // echoed up to $maxFileSize bytes per file into the page.
                $subject = file_get_contents($file);
            }
            elseif (endsWith($file, ".rtf")) {
                $ext = ".rtf";
                $subject = rtf2text($file);
            }

            // A null/false subject means "unsupported type" or "reader failed".
            // preg_match_all() returns 0 or false when nothing matched or the
            // match failed, so the body only runs when there is at least one hit.
            if ($subject !== null && $subject !== false && preg_match_all($datePattern, $subject, $regs)) {
                foreach ($regs[0] as $match) {
                    dbQueryWithExt($match, $file, $filename, $ext);
                    echo $match . "|";
                }
                // One line per file that produced at least one match.
                echo $file . "<br>";
            }
        }

        // (5) leaving the window. This check is now reached for skipped files
        //     too, so an oversized or unreadable $endFile still ends the scan;
        //     nothing after it would be processed, so stop traversing.
        d($endFile === $path);
        if ($endFile === $path) {
            $counter = 2;
            break 2;
        }
    }
}
// testing only
// d($files);
/**
 * Report whether $string ends with the suffix $test.
 *
 * The comparison is byte-wise and case-sensitive. Both arguments are cast to
 * string first, so objects with a __toString() method (callers in this file
 * pass SplFileInfo objects) are accepted.
 *
 * @param mixed $string Value whose ending is inspected.
 * @param mixed $test   Suffix to look for; an empty suffix always matches.
 * @return bool True when $string ends with $test, false otherwise.
 */
function endsWith($string, $test) {
    $string = (string) $string;
    $test = (string) $test;
    $testlen = strlen($test);
    // An empty suffix needs its own case: the offset -0 is 0, which would
    // compare the whole string against "" and report a mismatch.
    if ($testlen === 0) {
        return true;
    }
    if ($testlen > strlen($string)) {
        return false;
    }
    return substr_compare($string, $test, -$testlen) === 0;
}
/**
 * Store one matched date string in the dates_from_content table.
 *
 * Each value is escaped with mysqli_real_escape_string() on the global $dbc
 * connection, the INSERT statement is dumped through d() for debugging and
 * then handed to dbQuery().
 *
 * @param mixed $match    Matched text, stored in the "date" column.
 * @param mixed $file     File the match came from, stored in "path".
 * @param mixed $filename Value stored in "search_in".
 * @param mixed $ext      Value stored in "search_exists".
 */
function dbQueryWithExt ($match, $file, $filename, $ext) {
    global $dbc;

    // Escape the four values in column order.
    $escaped = array();
    foreach (array($match, $file, $filename, $ext) as $value) {
        $escaped[] = mysqli_real_escape_string($dbc, $value);
    }

    $QueryString = vsprintf(
        "INSERT INTO dates_from_content (date, path, search_in, search_exists) VALUES ('%s', '%s', '%s', '%s')",
        $escaped
    );

    d($QueryString);
    dbQuery($QueryString);
}
/* extend the function so that can add more extensions
function compareEndsWith($file, $extensions)
if (!is_array($extensions)) return false;
*/
?>
一些说明: (1)函数d()实际上是Kint库提供的var_dump()替代函数,它会对输出进行格式化。我只是用它来调试。
答案 0 :(得分:0)
要考虑的选项: