Question

我有一个PDF，我使用PDFBox从中提取了一个页面：

(...)
// Load the PDF document from disk.
File input = new File("C:\\temp\\sample.pdf");
document = PDDocument.load(input);
List allPages = document.getDocumentCatalog().getAllPages();
// Take the entry at index 2 of the page list (list indices are zero-based).
PDPage page = (PDPage) allPages.get(2);
PDStream contents = page.getContents();
// Print the page's content stream as a string, but only when the page has one.
if (contents != null) {
System.out.println(contents.getInputStreamAsString());
(...)

这给出了以下结果，根据PDF规范，它看起来正如您所期望的那样。

q
/GS0 gs
/Fm0 Do
Q
/Span <</Lang (en-US)/MCID 88 >>BDC 
BT
/CS0 cs 0 0 0  scn
/GS1 gs
/T1_0 1 Tf
8.5 0 0 8.5 70.8661 576 Tm
(This page has been intentionally left blank.)Tj
ET
EMC 
1 1 1  scn
/GS0 gs
22.677 761.102 28.346 32.599 re
f
/Span <</Lang (en-US)/MCID 89 >>BDC 
BT
0.531 0.53 0.528  scn
/T1_1 1 Tf
9 0 0 9 45.7136 761.1024 Tm
(2)Tj
ET
EMC 
q
0 g
/Fm1 Do
Q

我正在寻找的是将页面上的PDF文本对象（Text Object，如PDF规范的5.3节中所述）提取为Java对象，所以基本上就是BT和ET之间的部分（本页上有两个）。它们至少应该包含“Tj”之前括号中的所有字符串，以及基于“Tm”（或“Td”等运算符）的x和y坐标。其他属性是加分项，但不是必需的。

PDFTextStripper似乎要么把每个字符连同其属性作为TextPosition给我（对我的目的来说噪音太多），要么把所有文本作为一个长String给我。

PDFBox是否有一个功能可以解析一个页面并提供我错过的像这样的TextObjects？或者，如果我要扩展PDFBox以获得我需要的东西，我应该从哪里开始？欢迎任何帮助。

编辑：找到了另一个问题here，它为我如何构建我需要的东西提供了灵感。如果我成功了，我会回来看看。尽管如此，仍然期待着您的任何帮助。

谢谢，

菲尔

Answer 1

根据链接的问题和mkl昨天的提示（谢谢！），我决定构建一些东西来解析令牌。要考虑的是在PDF文本对象中，属性在操作符之前，因此我收集集合中的所有属性，直到遇到操作符。然后，当我知道属性属于哪个运算符时，我将它们移动到适当的位置。这就是我提出的：

import java.io.File;
import java.util.List;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;

/**
 * Sample program: prints the text and position of each text object (the part of a
 * content stream between the BT and ET operators) found on one page of a PDF.
 *
 * <p>In a content stream the operands come before their operator, so operands are
 * buffered in the current {@link PDFTextObject} until an operator token arrives.
 * The operator then either consumes them (Tj, Tm) or they are discarded.
 */
public class TextExtractor {
    /**
     * Parses the page at index 2 of the hard-coded input file and prints one line per
     * text object in the form {@code Text: <text>@<x>,<y>}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        File input = new File("C:\\some\\file.pdf");
        // try-with-resources so the document is closed even when parsing fails.
        try (PDDocument document = PDDocument.load(input)) {
            List<?> allPages = document.getDocumentCatalog().getAllPages();
            // Only one page is parsed, as this is just a sample. The index is
            // zero-based, so 2 selects the third entry of the page list.
            PDPage page = (PDPage) allPages.get(2);
            PDStream contents = page.getContents();
            if (contents == null) {
                // A page without a content stream has no text objects to report.
                System.err.println("Page has no content stream; nothing to extract.");
                return;
            }
            PDFStreamParser parser = new PDFStreamParser(contents.getStream());
            parser.parse();
            List<?> tokens = parser.getTokens();

            // true while the tokens being read lie between BT and ET
            boolean parsingTextObject = false;
            PDFTextObject textobj = new PDFTextObject();
            for (Object next : tokens) {
                if (next instanceof PDFOperator) {
                    PDFOperator op = (PDFOperator) next;
                    switch (op.getOperation()) {
                        case "BT":
                            // BT: Begin Text. Start collecting into a fresh text object.
                            parsingTextObject = true;
                            textobj = new PDFTextObject();
                            break;
                        case "ET":
                            // ET: End Text. Report what was collected.
                            parsingTextObject = false;
                            System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
                            break;
                        case "Tj":
                            // Tj: the buffered operands hold the string to show.
                            textobj.setText();
                            break;
                        case "Tm":
                            // Tm: the buffered operands hold the text matrix.
                            textobj.setMatrix();
                            break;
                        default:
                            // Any other operator is ignored; its operands are dropped below.
                            break;
                    }
                    // The buffered operands belonged to this operator; discard them
                    // so they are not mistaken for operands of the next one.
                    textobj.clearAllAttributes();
                } else if (parsingTextObject) {
                    // Operand inside a text object: keep it until its operator shows up.
                    textobj.addAttribute(next);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

结合：

import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;

/**
 * Collects the data of one PDF text object (the operators between BT and ET).
 *
 * <p>Operands are buffered with {@link #addAttribute(Object)}. When the caller sees
 * the operator they belong to, it calls {@link #setText()} (for Tj) or
 * {@link #setMatrix()} (for Tm) to interpret the buffer, and then
 * {@link #clearAllAttributes()} to empty it.
 */
class PDFTextObject {
    /** Operands seen since the last operator. */
    private final List<Object> attributes = new ArrayList<>();
    private String text = "";
    /** -1 until a matrix with a numeric operand at index 4 has been applied. */
    private float x = -1;
    /** -1 until a matrix with a numeric operand at index 5 has been applied. */
    private float y = -1;

    /** Discards all buffered operands. */
    public void clearAllAttributes() {
        attributes.clear();
    }

    /**
     * Buffers one operand.
     *
     * @param anAttribute the operand token to remember
     */
    public void addAttribute(Object anAttribute) {
        attributes.add(anAttribute);
    }

    /**
     * Appends every buffered {@code COSString} operand to the text of this object.
     * Operands of any other type are reported on standard output and skipped.
     */
    public void setText() {
        StringBuilder appended = new StringBuilder(text);
        for (Object attribute : attributes) {
            if (attribute instanceof COSString) {
                appended.append(((COSString) attribute).getString());
            } else {
                System.out.println("Whoops! Wrong type of property...");
            }
        }
        text = appended.toString();
    }

    /** @return the text collected so far; empty if none */
    public String getText() {
        return text;
    }

    /**
     * Reads x and y from the buffered operands of a matrix operator.
     *
     * <p>A matrix has 6 operands, the last two of which (indices 4 and 5) are x
     * and y. An operand at one of those positions that is neither a
     * {@code COSInteger} nor a {@code COSFloat} yields -1.
     */
    public void setMatrix() {
        for (int i = 4; i < attributes.size(); i++) {
            Object operand = attributes.get(i);
            float curval = -1;
            if (operand instanceof COSInteger) {
                curval = ((COSInteger) operand).floatValue();
            } else if (operand instanceof COSFloat) {
                curval = ((COSFloat) operand).floatValue();
            }
            switch (i) {
                case 4:
                    x = curval;
                    break;
                case 5:
                    y = curval;
                    break;
                default:
                    // operands beyond the sixth are not part of the matrix
                    break;
            }
        }
    }

    /** @return the x value taken from the last applied matrix, or -1 if none */
    public float getX() {
        return x;
    }

    /** @return the y value taken from the last applied matrix, or -1 if none */
    public float getY() {
        return y;
    }
}

它给出了输出：

Text: This page has been intentionally left blank.@70.8661,576.0
Text: 2@45.7136,761.1024

虽然它可以解决问题，但我确信我已经破坏了一些惯例，并且并不总是写出最优雅的代码。欢迎改进和替代解决方案。

Answer 2

我添加了Phil回答的一个适用于pdfbox-2.0.1的版本：

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;

/**
 * Sample program (PDFBox 2.0 API): prints the text and position of each text
 * object (the part of a content stream between the BT and ET operators) found on
 * one page of a PDF.
 *
 * <p>In a content stream the operands come before their operator, so operands are
 * buffered in the current {@link PDFTextObject} until an operator token arrives.
 * The operator then either consumes them (Tj, Tm) or they are discarded.
 */
public class TextExtractor {
  /**
   * Parses the page at index 0 of the hard-coded input file and prints one line per
   * text object in the form {@code Text: <text>@<x>,<y>}. Unsupported operators
   * and tokens outside text objects are logged to standard output.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    File input = new File("src\\test\\resources\\files\\file1.pdf");
    // try-with-resources so the document is closed even when parsing fails.
    try (PDDocument document = PDDocument.load(input)) {
      PDPageTree allPages = document.getDocumentCatalog().getPages();
      // Only one page is parsed, as this is just a sample: index 0 of the page tree.
      PDPage page = allPages.get(0);
      PDFStreamParser parser = new PDFStreamParser(page);
      parser.parse();
      List<?> tokens = parser.getTokens();

      // true while the tokens being read lie between BT and ET
      boolean parsingTextObject = false;
      PDFTextObject textobj = new PDFTextObject();
      for (Object next : tokens) {
        if (next instanceof Operator) {
          Operator op = (Operator) next;
          switch (op.getName()) {
          case "BT":
            // BT: Begin Text. Start collecting into a fresh text object.
            parsingTextObject = true;
            textobj = new PDFTextObject();
            break;
          case "ET":
            // ET: End Text. Report what was collected.
            parsingTextObject = false;
            System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
            break;
          case "Tj":
            // Tj: the buffered operands hold the string to show.
            textobj.setText();
            break;
          case "Tm":
            // Tm: the buffered operands hold the text matrix.
            textobj.setMatrix();
            break;
          default:
            System.out.println("unsupported operation " + op);
          }
          // The buffered operands belonged to this operator; discard them so
          // they are not mistaken for operands of the next one.
          textobj.clearAllAttributes();
        } else if (parsingTextObject) {
          // Operand inside a text object: keep it until its operator shows up.
          textobj.addAttribute(next);
        } else {
          System.out.println("ignore "+next.getClass()+" -> "+next);
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Collects the data of one PDF text object (the operators between BT and ET).
   *
   * <p>Operands are buffered with {@link #addAttribute(Object)}. When the caller
   * sees the operator they belong to, it calls {@link #setText()} (for Tj) or
   * {@link #setMatrix()} (for Tm) to interpret the buffer, and then
   * {@link #clearAllAttributes()} to empty it.
   */
  static class PDFTextObject {
    /** Operands seen since the last operator. */
    private final List<Object> attributes = new ArrayList<>();
    private String text = "";
    /** -1 until a matrix with a numeric operand at index 4 has been applied. */
    private float x = -1;
    /** -1 until a matrix with a numeric operand at index 5 has been applied. */
    private float y = -1;

    /** Discards all buffered operands. */
    public void clearAllAttributes() {
      attributes.clear();
    }

    /**
     * Buffers one operand.
     *
     * @param anAttribute the operand token to remember
     */
    public void addAttribute(Object anAttribute) {
      attributes.add(anAttribute);
    }

    /**
     * Appends every buffered {@code COSString} operand to the text of this object.
     * Operands of any other type are reported on standard output and skipped.
     */
    public void setText() {
      StringBuilder appended = new StringBuilder(text);
      for (Object attribute : attributes) {
        if (attribute instanceof COSString) {
          appended.append(((COSString) attribute).getString());
        } else {
          System.out.println("Whoops! Wrong type of property...");
        }
      }
      text = appended.toString();
    }

    /** @return the text collected so far; empty if none */
    public String getText() {
      return text;
    }

    /**
     * Reads x and y from the buffered operands of a matrix operator.
     *
     * <p>A matrix has 6 operands, the last two of which (indices 4 and 5) are x
     * and y. An operand at one of those positions that is neither a
     * {@code COSInteger} nor a {@code COSFloat} yields -1.
     */
    public void setMatrix() {
      for (int i = 4; i < attributes.size(); i++) {
        Object operand = attributes.get(i);
        float curval = -1;
        if (operand instanceof COSInteger) {
          curval = ((COSInteger) operand).floatValue();
        } else if (operand instanceof COSFloat) {
          curval = ((COSFloat) operand).floatValue();
        }
        switch (i) {
          case 4:
            x = curval;
            break;
          case 5:
            y = curval;
            break;
          default:
            // operands beyond the sixth are not part of the matrix
            break;
        }
      }
    }

    /** @return the x value taken from the last applied matrix, or -1 if none */
    public float getX() {
      return x;
    }

    /** @return the y value taken from the last applied matrix, or -1 if none */
    public float getY() {
      return y;
    }
  }
}

使用PDFBox获取PDF TextObjects

2 个答案: