我需要获取PDF文件的元数据。我使用 Smalot PdfParser 和 TCPDF Parser 来解析文件。
我使用 Smalot PdfParser 解析PDF文件,然后使用 TCPDF 解析器库获取PDF文件的元数据和内容。它适用于小型PDF文件,但当我解析10 MB或更大的PDF文件时,内存会被耗尽(超出内存限制),脚本停止执行,而且没有显示任何错误信息。我已将内存限制设置为1024M。
/**
 * Parse the PDF file located at the given path.
 *
 * This is a convenience entry point: the path is handed to
 * parseContent() unchanged and that method's result is returned.
 *
 * @param string $filename Path of the PDF file, forwarded as-is.
 *
 * @return mixed Whatever parseContent() returns for this path.
 */
public function parseFile($filename)
{
    $document = $this->parseContent($filename);

    return $document;
}
/**
 * Read a PDF file from disk, parse it with TCPDF_PARSER and build a Document.
 *
 * Note that, despite the method name, the argument is a file path: the
 * file's bytes are loaded here with file_get_contents().
 *
 * @param string $filename Path of the PDF file to read.
 *
 * @return Document The document populated with the parsed objects and trailer.
 *
 * @throws \Exception If the file cannot be read, if the trailer carries an
 *                    'encrypt' entry, or if the parser produced no objects.
 */
public function parseContent($filename)
{
    $content = file_get_contents($filename);
    if ($content === false)
    {
        // Fail loudly instead of handing `false` to the parser.
        throw new \Exception('Unable to read pdf file: ' . $filename);
    }

    // Create structure using TCPDF Parser.
    // Anything the parser prints is buffered and discarded; the finally
    // block guarantees the buffer is closed even when the parser throws,
    // so a failure can no longer leave output buffering switched on.
    ob_start();
    try
    {
        $parser = new \TCPDF_PARSER($content);
        // Drop our own reference to the raw bytes as early as possible.
        unset($content);
        list($xref, $data) = $parser->getParsedData();
        unset($parser);
    }
    finally
    {
        ob_end_clean();
    }

    if (isset($xref['trailer']['encrypt']))
    {
        throw new \Exception('Secured pdf file are currently not supported.');
    }
    if (empty($data))
    {
        throw new \Exception('Object list not found. Possible secured file.');
    }

    // Create destination object.
    $document = new Document();
    $this->objects = array();

    // Walk the keys rather than the array itself: a by-value foreach over
    // $data iterates a copy (PHP 7+), so unsetting entries inside such a
    // loop cannot release them until the loop ends. Detaching each entry
    // first lets it be released once it has been processed.
    foreach (array_keys($data) as $id)
    {
        $structure = $data[$id];
        unset($data[$id]);
        $this->parseObject($id, $structure, $document);
    }
    unset($structure, $data);

    $document->setTrailer($this->parseTrailer($xref['trailer'], $document));
    $document->setObjects($this->objects);

    return $document;
}