我正在尝试使用PdfParser库从pdf文件中提取数据。
当我尝试使用几个大而复杂的pdf文件时,它给了我一个错误:
允许的内存大小 134217728 字节已耗尽(Allowed memory size of 134217728 bytes exhausted)
我需要一个永久的解决方案;问题要么出在库本身,要么是我的实现有误。
这是我的代码:
/**
 * Walks a directory tree and, for every PDF found in a leaf directory,
 * extracts its text with PdfParser and appends it to "<pdf basename>.txt".
 *
 * Progress is reported by echoing HTML fragments.
 */
class crawlController extends Controller
{
    /**
     * Entry point: crawl the hard-coded PDF root directory.
     */
    public function crawler()
    {
        $dirPath = '/home/development/pdf_root';
        $this->getFileFolderTree($dirPath);
    }

    /**
     * Crawl the tree rooted at the given directory.
     *
     * @param string $rootDirectory Directory to start from.
     */
    public function getFileFolderTree($rootDirectory)
    {
        $this->goIntoFolder($rootDirectory);
    }

    /**
     * Recursively descend into $dirPath. Directories without sub-directories
     * are scanned for PDF files.
     *
     * NOTE(review): files that sit next to sub-directories are never scanned,
     * only leaf directories are. Confirm this is intended.
     *
     * @param string $dirPath Directory to inspect.
     */
    public function goIntoFolder($dirPath)
    {
        // Get all the direct sub folders of this folder.
        try {
            $dirList = File::directories($dirPath);
        } catch (\InvalidArgumentException $e) {
            // The original handler did `require $e->getMessage()`, i.e. tried
            // to include a file named after the error text, and then went on
            // to use an undefined $dirList. Report the problem and skip this
            // directory instead.
            echo "Unable to read directory : ", $dirPath, " (", $e->getMessage(), ")<br/>";
            return;
        }

        if (count($dirList) == 0) {
            // Leaf directory: look for files now.
            $this->searchFiles($dirPath);
            return;
        }

        // Loop through the list of directories.
        foreach ($dirList as $dir) {
            // Print name of the selected directory.
            echo "Folder name : ",basename($dir)," Parent Folder :",basename($dirPath),"<br/>";
            // Recursively search the selected directory.
            $this->goIntoFolder($dir);
        }
        echo "<hr><br/>";
    }

    /**
     * Extract text from every PDF file directly inside $dirPath.
     *
     * @param string $dirPath Directory whose files are examined.
     *
     * @return bool TRUE when the directory contains at least one file
     *              (PDF or not), FALSE when it is empty.
     */
    public function searchFiles($dirPath)
    {
        // Read all files.
        $files = File::files($dirPath);

        // Nothing to do when the directory has no files.
        if (count($files) == 0) {
            return FALSE;
        }

        foreach ($files as $file) {
            // Only PDF files are processed; the extension check is case-insensitive.
            if (0 == strcasecmp('pdf', File::extension($file))) {
                $this->readFileData($file);
            }
        }

        return TRUE;
    }

    /**
     * Parse one PDF and append its text to "<basename>.txt" (relative path,
     * so it is resolved against the current working directory).
     *
     * Echoes "success : <file>" or "Faliure : <file>".
     *
     * @param mixed $file Path of the PDF file (anything basename() accepts).
     */
    public function readFileData($file)
    {
        $parser = new \Smalot\PdfParser\Parser();
        $pdfLoad = null;
        $content = null;
        $bytesWritten = false;
        $error = null;

        try {
            $pdfLoad = $parser->parseFile($file);
            $content = $pdfLoad->getText();
            $txtFilename = basename($file).".txt";
            $bytesWritten = File::append($txtFilename, $content);
        } catch (\Exception $e) {
            // One unreadable PDF must not abort the whole crawl.
            $error = $e->getMessage();
        }

        // Compare against false explicitly: appending an empty string writes
        // 0 bytes, which is still a successful write.
        if ($bytesWritten !== false) {
            echo "success : ",$file;
        } else {
            echo "Faliure : ",$file;
            if ($error !== null) {
                echo " (", $error, ")";
            }
        }

        // Drop every reference to the parsed document, not just the parser,
        // and force a cycle collection so memory from this file is released
        // before the next one is parsed. Presumably the parsed object graph
        // contains reference cycles that plain refcounting does not free.
        // NOTE(review): a single PDF whose parsed form exceeds memory_limit
        // will still fail; that needs a higher limit or a per-file process.
        unset($content, $pdfLoad, $parser);
        gc_collect_cycles();
    }
}