无法读取跨行突出显示的确切文本

时间:2015-09-16 12:03:16

标签: java pdf pdfbox text-extraction

我正在使用 PDFBox 读取 PDF 文档中突出显示(高亮)的内容。对于位于同一行内的突出显示文本,无论是一个单词还是多个单词,我都能正确读取。但是,我无法准确读取跨行突出显示的文本。请查看以下用于读取突出显示文本的示例代码。

// Walks every page of the sample PDF and, for each annotation that is not a
// popup, prints the annotation subtype, the text found inside the annotation's
// rectangle, and the annotation's own contents.
//
// The extraction region is derived from pdfAnnot.getRectangle(), i.e. one
// rectangle per annotation; this is the behaviour the question is about.
PDDocument pddDocument = PDDocument.load(new File("C:\\pdf-sample.pdf"));
try {
    List<?> allPages = pddDocument.getDocumentCatalog().getAllPages();
    for (int i = 0; i < allPages.size(); i++) {
        int pageNum = i + 1;
        PDPage page = (PDPage) allPages.get(i);
        List<PDAnnotation> la = page.getAnnotations();
        if (la.isEmpty()) {
            continue;
        }
        System.out.println("Page number : "+pageNum);
        for (PDAnnotation pdfAnnot: la) {
            // Constant-first comparison: an annotation without a subtype must
            // not abort the whole run with a NullPointerException.
            if ("Popup".equals(pdfAnnot.getSubtype())) {
                continue;
            }

            PDRectangle rect = pdfAnnot.getRectangle();
            if (rect == null) {
                // No rectangle means there is no region to extract from.
                continue;
            }

            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);

            float x = rect.getLowerLeftX() - 1;
            float y = rect.getUpperRightY() - 1;
            float width = rect.getWidth();
            // Region is made a quarter taller than the annotation rectangle.
            float height = rect.getHeight() + rect.getHeight() / 4;

            // The annotation rectangle is measured from the bottom of the page,
            // the extraction region from the top, so y is flipped against the
            // page height. Only done for unrotated pages, as in the original.
            int rotation = page.findRotation();
            if (rotation == 0) {
                PDRectangle pageSize = page.getMediaBox();
                y = pageSize.getHeight() - y;
            }

            String regionName = Integer.toString(0);
            Rectangle2D.Float awtRect = new Rectangle2D.Float(x, y, width, height);
            stripper.addRegion(regionName, awtRect);
            stripper.extractRegions(page);
            System.out.println("------------------------------------------------------------------");
            System.out.println("Annot type = " + pdfAnnot.getSubtype());
            System.out.println("Getting text from region = " + stripper.getTextForRegion(regionName) + "\n");
            System.out.println("Getting text from comment = " + pdfAnnot.getContents());
        }
    }
} finally {
    // Always release the document, even when extraction throws.
    pddDocument.close();
}

// Prints, for every non-popup annotation on every page, the text lying inside
// the annotation's rectangle together with the annotation's contents.
PDDocument pddDocument = PDDocument.load(new File("C:\\pdf-sample.pdf"));
List allPages = pddDocument.getDocumentCatalog().getAllPages();
for (int pageNum = 1; pageNum <= allPages.size(); pageNum++) {
    PDPage page = (PDPage) allPages.get(pageNum - 1);
    List<PDAnnotation> la = page.getAnnotations();
    if (la.size() >= 1) {
        System.out.println("Page number : " + pageNum);
        for (PDAnnotation pdfAnnot : la) {
            if (!pdfAnnot.getSubtype().equals("Popup")) {
                PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
                areaStripper.setSortByPosition(true);

                // Region geometry taken from the annotation rectangle.
                PDRectangle bounds = pdfAnnot.getRectangle();
                float left = bounds.getLowerLeftX() - 1;
                float top = bounds.getUpperRightY() - 1;
                float regionWidth = bounds.getWidth();
                float regionHeight = bounds.getHeight() + bounds.getHeight() / 4;

                // Flip y against the page height for unrotated pages only.
                if (page.findRotation() == 0) {
                    top = page.getMediaBox().getHeight() - top;
                }

                String regionKey = Integer.toString(0);
                areaStripper.addRegion(regionKey, new Rectangle2D.Float(left, top, regionWidth, regionHeight));
                areaStripper.extractRegions(page);

                String regionText = areaStripper.getTextForRegion(regionKey);
                System.out.println("------------------------------------------------------------------");
                System.out.println("Annot type = " + pdfAnnot.getSubtype());
                System.out.println("Getting text from region = " + regionText + "\n");
                System.out.println("Getting text from comment = " + pdfAnnot.getContents());
            }
        }
    }
}

在读取跨行突出显示的文本时,“pdfAnnot.getRectangle()”函数返回的是包围该文本的最小矩形区域,因此提取到的文本比所需的更多。我找不到任何 API 可以提取确切的突出显示文本。

例如: 从测试PDF文件中提取的文本。

  

任何地方的任何人都可以打开 PDF 文件。您只需要免费的 Adobe Acrobat

     

阅读器即可。其他文件格式的收件人有时无法打开文件,因为他们

     

没有用于创建文档的应用程序。

用例1:   读取第一处粗体文本,即 PDF 。读取位于单行内的突出显示文本没有问题,程序会打印出正确的文本,如下所示:
 输出:     Getting text from region = "PDF"

用例2:   读取第二处粗体文本,即 Adobe Acrobat Reader ,它跨越两行。在这种情况下,运行上述程序提取到的文本是:
输出: Getting text from region = "任何地方的任何人都可以打开 PDF 文件。您只需要免费的 Adobe Acrobat 阅读器即可。其他文件格式的收件人有时无法打开文件,因为他们"

getRectangle() API 返回的是包围突出显示文本的最小矩形的坐标。因此,提取到的文本比“Adobe Acrobat Reader”更多。

  1. 如何知道提取区域中突出显示的起点和终点。
  2. 如何知道提取区域中的行数。
  3. 任何帮助都将受到高度赞赏。

2 个答案:

答案 0 :(得分:1)

我设法使用以下代码提取突出显示的文本。

// PDF32000-2008
// 12.5.2 Annotation Dictionaries
// 12.5.6 Annotation Types
// 12.5.6.10 Text Markup Annotations
/**
 * Extracts the text covered by each {@code Highlight} annotation on one page.
 *
 * <p>A highlight's {@code QuadPoints} array is read in groups of eight numbers,
 * one group per highlighted quadrilateral. Each group becomes its own extraction
 * region, and the texts of all regions of one annotation are concatenated in
 * array order into a single result entry.
 *
 * <p>Highlight annotations that have no {@code QuadPoints} array, or fewer than
 * eight numbers in it, are skipped and contribute no entry.
 *
 * @param filePath   path of the PDF file to load
 * @param pageNumber zero-based index into the document's page list
 * @return one string per highlight annotation that has at least one complete
 *         quad, in annotation order; empty if the page has none
 * @throws IOException if the document cannot be loaded or the page cannot be processed
 * @throws IndexOutOfBoundsException if {@code pageNumber} is not a valid page index
 */
@SuppressWarnings("unchecked") // getAllPages()/getAnnotations() are assigned to typed lists
public ArrayList<String> getHighlightedText(String filePath, int pageNumber) throws IOException {
    ArrayList<String> highlightedTexts = new ArrayList<>();
    PDDocument document = PDDocument.load(filePath);
    try {
        List<PDPage> allPages = document.getDocumentCatalog().getAllPages();
        PDPage page = allPages.get(pageNumber);
        List<PDAnnotation> annotations = page.getAnnotations();

        for (PDAnnotation annotation : annotations) {
            if (!"Highlight".equals(annotation.getSubtype())) {
                continue;
            }

            Object quadPoints = annotation.getDictionary()
                    .getDictionaryObject(COSName.getPDFName("QuadPoints"));
            if (!(quadPoints instanceof COSArray)) {
                continue;
            }

            // toFloatArray() accepts integer as well as real coordinates,
            // unlike a hard cast of each element to COSFloat.
            float[] quads = ((COSArray) quadPoints).toFloatArray();
            int quadCount = quads.length / 8;
            if (quadCount == 0) {
                continue;
            }

            float pageHeight = page.getMediaBox().getHeight();
            PDFTextStripperByArea stripperByArea = new PDFTextStripperByArea();

            // Register one region per quad so the page is parsed only once.
            for (int q = 0; q < quadCount; q++) {
                int k = q * 8;
                float upperLeftX = quads[k];
                float upperLeftY = quads[k + 1];
                float upperRightX = quads[k + 2];
                float upperRightY = quads[k + 3];
                float lowerLeftX = quads[k + 4];
                float lowerLeftY = quads[k + 5];

                float x = upperLeftX - 1;
                // Quad coordinates are flipped against the page height to get
                // the y value the extraction region expects.
                float y = pageHeight - upperLeftY;
                float width = upperRightX - lowerLeftX;
                float height = upperRightY - lowerLeftY;

                stripperByArea.addRegion("highlightedRegion" + q,
                        new Rectangle2D.Float(x, y, width, height));
            }
            stripperByArea.extractRegions(page);

            StringBuilder text = new StringBuilder();
            for (int q = 0; q < quadCount; q++) {
                text.append(stripperByArea.getTextForRegion("highlightedRegion" + q));
            }
            highlightedTexts.add(text.toString());
        }
    } finally {
        // Release the document even when extraction fails part-way.
        document.close();
    }

    return highlightedTexts;
}

答案 1 :(得分:0)

要使 @roham-amini 提供的代码在当前版本的 Apache PDFBox (2.0) 中工作,您必须进行大量更改。

这段代码运行良好,我在 Freeplane 的一个 groovy 脚本中使用了它。您可能需要更改 logger.info 函数。

@Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.22')
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.annotation.*;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import org.apache.pdfbox.pdmodel.common.*;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.cos.*




// PDDocument document = new PDDocument();
// Collects the text under every Highlight annotation of the PDF and logs one
// message per highlight. Each group of eight QuadPoints numbers is treated as
// one highlighted quadrilateral and extracted as its own region; the region
// texts of one annotation are concatenated in array order.
// `logger` is expected to be supplied by the hosting script environment.
String pdfFilePath = 'temp.pdf'
PDDocument pdfDoc = PDDocument.load(new File(pdfFilePath));
ArrayList<String> highlightedTexts = new ArrayList<>();

int pageNum = 0;
try {
    for (PDPage pdfpage : pdfDoc.getPages()) {
        pageNum++;
        List<PDAnnotation> annotations = pdfpage.getAnnotations();
        for (PDAnnotation annot : annotations) {
            String annotNote = annot.getContents();    // text typed into the note
            String annotSubType = annot.getSubtype();  // annotation type (Highlight, Text, ...)
            if (!'Highlight'.equals(annotSubType)) {
                continue;
            }

            COSArray quadsArray = annot.getCOSObject().getCOSArray(COSName.getPDFName("QuadPoints"));
            if (quadsArray == null) {
                // Highlight without QuadPoints: nothing to extract.
                continue;
            }
            float[] quads = quadsArray.toFloatArray();
            int quadCount = quads.length.intdiv(8);
            if (quadCount == 0) {
                continue;
            }

            float pageHeight = pdfpage.getMediaBox().getHeight();
            PDFTextStripperByArea stripper = new PDFTextStripperByArea();

            // Register one region per quad so the page is parsed only once.
            for (int q = 0; q < quadCount; q++) {
                int k = q * 8;
                float upperLeftX = quads[k];
                float upperLeftY = quads[k + 1];
                float upperRightX = quads[k + 2];
                float upperRightY = quads[k + 3];
                float lowerLeftX = quads[k + 4];
                float lowerLeftY = quads[k + 5];

                float x = upperLeftX - 1;
                // Flip against the page height to get the region's y value.
                float y = pageHeight - upperLeftY;
                float width = upperRightX - lowerLeftX;
                float height = upperRightY - lowerLeftY;

                stripper.addRegion("highlightedRegion" + q, new Rectangle2D.Float(x, y, width, height));
            }
            stripper.extractRegions(pdfpage);

            StringBuilder text = new StringBuilder();
            for (int q = 0; q < quadCount; q++) {
                // Newlines and tabs inside a region are flattened to spaces.
                text.append(stripper.getTextForRegion("highlightedRegion" + q).replaceAll("[\\n\\t ]", " "));
            }
            String str = text.toString();
            highlightedTexts.add(str);
            String logInfo = str;

            // NOTE(review): "Sessão" and "Nota" both print annotNote, as in the
            // original message; confirm whether another field was intended.
            String logMsg = ">>>>>>>>>>Pagina: " + pageNum + ", Sessão: " + annotNote + ", Nota: " + annotNote + "Texto sublinhado: " + logInfo;
            logger.info(logMsg);
        }
    }
} finally {
    // Close the document even if extraction fails part-way.
    pdfDoc.close();
}