我想做的是:我有一个 PDF 文件,其中包含注释以及针对注释的修改建议;当鼠标悬停在带注释的单词上时,这些注释就会显示出来。例如,请看上面的图片:其中“您要花费”这几个词带有删除线(表示该用词不正确),将鼠标悬停在它上面时,会弹出一个窗口,显示正确的单词。类似地,图中还有一个插入符号(caret),其行为也是如此。
答案 0 :(得分:0)
可惜的是,PDF 中的注释“树”并没有那么简单。下面的概念验证(POC)只是遍历注释并为其创建过滤器,再交给提取器组件使用。另有一个演示可以提取注释树,它可以作为得到排序更合理、逻辑性更强的结果的基础。
// Proof of concept: for every page of the document, collect the highlight,
// strike-out, caret and underline annotations, build one rectangle filter per
// quad of each text-markup annotation, extract the text located under those
// rectangles and finally print one HTML table per page with the annotation
// name, the extracted text, the subject and the comment.
//
// NOTE(review): the published snippet had lost its closing braces and a few
// statements. The calls marked "reconstructed" below were restored from the
// surviving comments and should be confirmed against the SetaPDF API.

// load and register the autoload function
// NOTE(review): the original include statement was lost; the path below is an
// assumption (Composer layout). Point it to your autoloader, e.g. SetaPDF's
// own "library/SetaPDF/Autoload.php".
require_once __DIR__ . '/vendor/autoload.php';

// create a document instance
$document = SetaPDF_Core_Document::loadByFilename('camtown/Terms-and-Conditions - revised.pdf');
// initiate an extractor instance
$extractor = new SetaPDF_Extractor($document);

// get the pages object of the document
$pages = $document->getCatalog()->getPages();

// extracted text, indexed by page number (and then by filter name)
$results = [];
// map pages and filter names to annotation instances
$annotationsByPageAndFilterName = [];

// annotation types this script is interested in
$wantedTypes = [
    SetaPDF_Core_Document_Page_Annotation::TYPE_HIGHLIGHT,
    SetaPDF_Core_Document_Page_Annotation::TYPE_STRIKE_OUT,
    SetaPDF_Core_Document_Page_Annotation::TYPE_CARET,
    SetaPDF_Core_Document_Page_Annotation::TYPE_UNDERLINE,
];

// iterate over all pages
for ($pageNo = 1, $pageCount = $pages->count(); $pageNo <= $pageCount; $pageNo++) {
    // get the page object
    $page = $pages->getPage($pageNo);

    // keep only the annotations of the wanted types (array keys are preserved
    // and used below as part of the filter name)
    $annotations = array_filter(
        $page->getAnnotations()->getAll(),
        static function (SetaPDF_Core_Document_Page_Annotation $annotation) use ($wantedTypes) {
            return in_array($annotation->getType(), $wantedTypes, true);
        }
    );

    // create a strategy instance
    $strategy = new SetaPDF_Extractor_Strategy_ExactPlain();
    // create a multi filter instance
    $filter = new SetaPDF_Extractor_Filter_Multi();
    // and pass it to the strategy (reconstructed call)
    $strategy->setFilter($filter);

    // iterate over all matching annotations
    foreach ($annotations as $tmpId => $annotation) {
        /**
         * @var SetaPDF_Core_Document_Page_Annotation_Highlight $annotation
         */
        $name = 'P#' . $pageNo . '/HA#' . $tmpId;
        if ($annotation->getName()) {
            $name .= ' (' . $annotation->getName() . ')';
        }

        if ($annotation instanceof SetaPDF_Core_Document_Page_Annotation_TextMarkup) {
            // iterate over the quad points to set up our filter instances;
            // every quad consists of 8 numbers (4 x/y pairs)
            $quadpoints = $annotation->getQuadPoints();
            for ($pos = 0, $c = count($quadpoints); $pos < $c; $pos += 8) {
                // bounding box of the quad, enlarged by 1 unit on every side
                $llx = min($quadpoints[$pos + 0], $quadpoints[$pos + 2], $quadpoints[$pos + 4], $quadpoints[$pos + 6]) - 1;
                $urx = max($quadpoints[$pos + 0], $quadpoints[$pos + 2], $quadpoints[$pos + 4], $quadpoints[$pos + 6]) + 1;
                $lly = min($quadpoints[$pos + 1], $quadpoints[$pos + 3], $quadpoints[$pos + 5], $quadpoints[$pos + 7]) - 1;
                $ury = max($quadpoints[$pos + 1], $quadpoints[$pos + 3], $quadpoints[$pos + 5], $quadpoints[$pos + 7]) + 1;

                // reduce it to a small line: both $lly and $ury end up at the
                // vertical middle of the box minus 1
                $diff = ($ury - $lly) / 2;
                $lly = $lly + $diff - 1;
                $ury = $ury - $diff - 1;

                // Add a new rectangle filter to the multi filter instance.
                // NOTE(review): the addFilter() wrapper, the mode and the id
                // argument are reconstructed; the id is assumed to be $name
                // because the results are looked up by $name further down.
                $filter->addFilter(
                    new SetaPDF_Extractor_Filter_Rectangle(
                        new SetaPDF_Core_Geometry_Rectangle($llx, $lly, $urx, $ury),
                        SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT,
                        $name
                    )
                );
            }
        }

        $annotationsByPageAndFilterName[$pageNo][$name] = $annotation;
    }

    // if no filters for this page defined, ignore it
    if (count($filter->getFilters()) === 0) {
        continue;
    }

    // pass the strategy to the extractor instance (reconstructed call)
    $extractor->setStrategy($strategy);
    // and get the results by the current page number
    $result = $extractor->getResultByPageNumber($pageNo);
    if ($result === '') {
        continue;
    }

    $results[$pageNo] = $result;
}

// Everything printed below originates from the PDF file, so it is escaped
// before it is embedded into the HTML output.
$escape = static function ($value): string {
    return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
};

// debug output
foreach ($annotationsByPageAndFilterName as $outputPageNo => $pageAnnotations) {
    echo '<h1>Page No #' . $outputPageNo . '</h1>';
    echo '<table border="1"><tr><th>Name</th><th>Text</th><th>Subject</th><th>Comment</th></tr>';
    foreach ($pageAnnotations as $name => $annotation) {
        echo '<tr>';
        echo '<td>' . $escape($name) . '</td>';
        // a page/filter without extracted text results in an empty cell
        echo '<td><pre>' . $escape($results[$outputPageNo][$name] ?? '') . '</pre></td>';
        echo '<td><pre>' . $escape($annotation->getSubject()) . '</pre></td>';
        echo '<td><pre>' . $escape($annotation->getContents()) . '</pre></td>';
        echo '</tr>';
    }
    echo '</table>';
}
答案 1 :(得分:0)
答案 2 :(得分:-1)