我有一个PDF,我使用PDFBox从中提取了一个页面:
(...)
File input = new File("C:\\temp\\sample.pdf");
document = PDDocument.load(input);
List allPages = document.getDocumentCatalog().getAllPages();
PDPage page = (PDPage) allPages.get(2);
PDStream contents = page.getContents();
if (contents != null) {
System.out.println(contents.getInputStreamAsString());
(...)
根据PDF spec,这给出了以下结果,看起来像您期望的那样。
q
/GS0 gs
/Fm0 Do
Q
/Span <</Lang (en-US)/MCID 88 >>BDC
BT
/CS0 cs 0 0 0 scn
/GS1 gs
/T1_0 1 Tf
8.5 0 0 8.5 70.8661 576 Tm
(This page has been intentionally left blank.)Tj
ET
EMC
1 1 1 scn
/GS0 gs
22.677 761.102 28.346 32.599 re
f
/Span <</Lang (en-US)/MCID 89 >>BDC
BT
0.531 0.53 0.528 scn
/T1_1 1 Tf
9 0 0 9 45.7136 761.1024 Tm
(2)Tj
ET
EMC
q
0 g
/Fm1 Do
Q
我想要的是把页面上的PDF文本对象(Text Object,见PDF规范第5.3节)提取为Java对象,也就是BT和ET之间的部分(本页上有两个)。它们至少应该包含“Tj”之前括号内的所有字符串,以及基于“Tm”(或“Td”等运算符)得到的x和y坐标。其他属性有则更好,但不是必需的。
PDFTextStripper似乎只能把每个字符连同其属性作为TextPosition给我(对我的用途来说噪音太多),或者把所有文本作为一个长String给我。
PDFBox是否有一个功能可以解析一个页面并提供我错过的像这样的TextObjects?或者,如果我要扩展PDFBox以获得我需要的东西,我应该从哪里开始?欢迎任何帮助。
编辑:找到了另一个问题here,它为我如何构建我需要的东西提供了灵感。如果我成功了,我会回来看看。尽管如此,仍然期待着您的任何帮助。
谢谢,
菲尔
答案 0 :(得分:6)
根据链接的问题和mkl昨天的提示(谢谢!),我决定构建一些东西来解析令牌。 要考虑的是在PDF文本对象中,属性在操作符之前,因此我收集集合中的所有属性,直到遇到操作符。 然后,当我知道属性属于哪个运算符时,我将它们移动到适当的位置。 这就是我提出的:
import java.io.File;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFOperator;
/**
 * Sample program that prints every PDF text object (the operators between
 * {@code BT} and {@code ET}) found in the content stream of one page, together
 * with the x/y values taken from its {@code Tm} operator.
 *
 * <p>Operands precede their operator in a content stream, so operands are
 * buffered in a {@link PDFTextObject} until the operator that consumes them is
 * seen.
 */
public class TextExtractor {

    /**
     * Loads the sample PDF, tokenizes the content stream of a single page and
     * prints one line per text object in the form {@code Text: <text>@<x>,<y>}.
     * Any exception is reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("C:\\some\\file.pdf");
        // try-with-resources closes the document even when parsing fails
        try (PDDocument document = PDDocument.load(input)) {
            List<?> allPages = document.getDocumentCatalog().getAllPages();
            // only the page at index 2 (i.e. the third page) is parsed, as this is just a sample
            PDPage page = (PDPage) allPages.get(2);
            PDStream contents = page.getContents();
            if (contents == null) {
                // a page without a content stream has no text objects to report
                return;
            }
            PDFStreamParser parser = new PDFStreamParser(contents.getStream());
            parser.parse();

            // true while the tokens being read lie between BT and ET
            boolean parsingTextObject = false;
            PDFTextObject textobj = new PDFTextObject();
            for (Object next : parser.getTokens()) {
                if (next instanceof PDFOperator) {
                    PDFOperator op = (PDFOperator) next;
                    switch (op.getOperation()) {
                        case "BT":
                            // BT: Begin Text -> start collecting into a fresh text object
                            parsingTextObject = true;
                            textobj = new PDFTextObject();
                            break;
                        case "ET":
                            // ET: End Text -> report what was collected
                            parsingTextObject = false;
                            System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
                            break;
                        case "Tj":
                            textobj.setText();
                            break;
                        case "Tm":
                            textobj.setMatrix();
                            break;
                        default:
                            // every other operator is ignored; its operands are dropped below
                            break;
                    }
                    // the buffered operands belonged to this operator and are now consumed
                    textobj.clearAllAttributes();
                } else if (parsingTextObject) {
                    textobj.addAttribute(next);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
结合:
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;
/**
 * Holds the state gathered for one PDF text object: the operands buffered for
 * the operator that has not been seen yet, the concatenated text of all
 * string operands handed to {@link #setText()}, and the x/y values taken from
 * the operands handed to {@link #setMatrix()}.
 */
class PDFTextObject {
    /** Operands collected since the last call to {@link #clearAllAttributes()}. */
    private final List<Object> attributes = new ArrayList<>();
    /** Text accumulated over all {@link #setText()} calls. */
    private final StringBuilder text = new StringBuilder();
    /** x value from the matrix operands; -1 until set or when the operand is not numeric. */
    private float x = -1;
    /** y value from the matrix operands; -1 until set or when the operand is not numeric. */
    private float y = -1;

    /** Discards all buffered operands. */
    public void clearAllAttributes() {
        attributes.clear();
    }

    /**
     * Buffers one operand for the next operator.
     *
     * @param anAttribute the operand token to buffer
     */
    public void addAttribute(Object anAttribute) {
        attributes.add(anAttribute);
    }

    /**
     * Appends every buffered {@code COSString} operand to the text. Operands of
     * any other type are skipped and reported on standard output.
     */
    public void setText() {
        for (Object attribute : attributes) {
            if (attribute instanceof COSString) {
                text.append(((COSString) attribute).getString());
            } else {
                System.out.println("Whoops! Wrong type of property...");
            }
        }
    }

    /**
     * Returns the text accumulated so far.
     *
     * @return the concatenated text, empty if nothing was added
     */
    public String getText() {
        return text.toString();
    }

    /**
     * Reads x and y from the buffered matrix operands. A matrix has six
     * operands, the last two of which (indices 4 and 5) are x and y. A value
     * is only updated when an operand exists at its index; a non-numeric
     * operand yields -1.
     */
    public void setMatrix() {
        if (attributes.size() > 4) {
            x = toFloat(attributes.get(4));
        }
        if (attributes.size() > 5) {
            y = toFloat(attributes.get(5));
        }
    }

    /**
     * Returns the x value.
     *
     * @return x, or -1 if it was never set
     */
    public float getX() {
        return x;
    }

    /**
     * Returns the y value.
     *
     * @return y, or -1 if it was never set
     */
    public float getY() {
        return y;
    }

    /** Converts a numeric COS operand to float; returns -1 for any other type. */
    private static float toFloat(Object operand) {
        if (operand instanceof COSInteger) {
            return ((COSInteger) operand).floatValue();
        }
        if (operand instanceof COSFloat) {
            return ((COSFloat) operand).floatValue();
        }
        return -1;
    }
}
它给出了输出:
Text: This page has been intentionally left blank.@70.8661,576.0
Text: 2@45.7136,761.1024
虽然它可以解决问题,但我确信我已经破坏了一些惯例,而且写出的代码并不总是最优雅的。欢迎改进和替代解决方案。
答案 1 :(得分:4)
我把Phil的答案移植到了pdfbox-2.0.1,版本如下:
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSFloat;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSString;
/**
 * Sample program (PDFBox 2.0 API) that prints every PDF text object (the
 * operators between {@code BT} and {@code ET}) found in the content stream of
 * one page, together with the x/y values taken from its {@code Tm} operator.
 *
 * <p>Operands precede their operator in a content stream, so operands are
 * buffered in a {@link PDFTextObject} until the operator that consumes them is
 * seen.
 */
public class TextExtractor {

    /**
     * Loads the sample PDF, tokenizes the content stream of its first page and
     * prints one line per text object in the form {@code Text: <text>@<x>,<y>}.
     * Unhandled operators and tokens outside a text object are reported on
     * standard output. Any exception is reported with a stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File input = new File("src\\test\\resources\\files\\file1.pdf");
        // try-with-resources closes the document even when parsing fails
        try (PDDocument document = PDDocument.load(input)) {
            PDPageTree allPages = document.getDocumentCatalog().getPages();
            // only the first page (index 0) is parsed, as this is just a sample
            PDPage page = allPages.get(0);
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();

            // true while the tokens being read lie between BT and ET
            boolean parsingTextObject = false;
            PDFTextObject textobj = new PDFTextObject();
            for (Object next : parser.getTokens()) {
                if (next instanceof Operator) {
                    Operator op = (Operator) next;
                    switch (op.getName()) {
                        case "BT":
                            // BT: Begin Text -> start collecting into a fresh text object
                            parsingTextObject = true;
                            textobj = new PDFTextObject();
                            break;
                        case "ET":
                            // ET: End Text -> report what was collected
                            parsingTextObject = false;
                            System.out.println("Text: " + textobj.getText() + "@" + textobj.getX() + "," + textobj.getY());
                            break;
                        case "Tj":
                            textobj.setText();
                            break;
                        case "Tm":
                            textobj.setMatrix();
                            break;
                        default:
                            System.out.println("unsupported operation " + op);
                            break;
                    }
                    // the buffered operands belonged to this operator and are now consumed
                    textobj.clearAllAttributes();
                } else if (parsingTextObject) {
                    textobj.addAttribute(next);
                } else {
                    System.out.println("ignore " + next.getClass() + " -> " + next);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Holds the state gathered for one PDF text object: the operands buffered
     * for the operator that has not been seen yet, the concatenated text of all
     * string operands handed to {@link #setText()}, and the x/y values taken
     * from the operands handed to {@link #setMatrix()}.
     */
    static class PDFTextObject {
        /** Operands collected since the last call to {@link #clearAllAttributes()}. */
        private final List<Object> attributes = new ArrayList<>();
        /** Text accumulated over all {@link #setText()} calls. */
        private final StringBuilder text = new StringBuilder();
        /** x value from the matrix operands; -1 until set or when the operand is not numeric. */
        private float x = -1;
        /** y value from the matrix operands; -1 until set or when the operand is not numeric. */
        private float y = -1;

        /** Discards all buffered operands. */
        public void clearAllAttributes() {
            attributes.clear();
        }

        /**
         * Buffers one operand for the next operator.
         *
         * @param anAttribute the operand token to buffer
         */
        public void addAttribute(Object anAttribute) {
            attributes.add(anAttribute);
        }

        /**
         * Appends every buffered {@code COSString} operand to the text.
         * Operands of any other type are skipped and reported on standard
         * output.
         */
        public void setText() {
            for (Object attribute : attributes) {
                if (attribute instanceof COSString) {
                    text.append(((COSString) attribute).getString());
                } else {
                    System.out.println("Whoops! Wrong type of property...");
                }
            }
        }

        /**
         * Returns the text accumulated so far.
         *
         * @return the concatenated text, empty if nothing was added
         */
        public String getText() {
            return text.toString();
        }

        /**
         * Reads x and y from the buffered matrix operands. A matrix has six
         * operands, the last two of which (indices 4 and 5) are x and y. A
         * value is only updated when an operand exists at its index; a
         * non-numeric operand yields -1.
         */
        public void setMatrix() {
            if (attributes.size() > 4) {
                x = toFloat(attributes.get(4));
            }
            if (attributes.size() > 5) {
                y = toFloat(attributes.get(5));
            }
        }

        /**
         * Returns the x value.
         *
         * @return x, or -1 if it was never set
         */
        public float getX() {
            return x;
        }

        /**
         * Returns the y value.
         *
         * @return y, or -1 if it was never set
         */
        public float getY() {
            return y;
        }

        /** Converts a numeric COS operand to float; returns -1 for any other type. */
        private static float toFloat(Object operand) {
            if (operand instanceof COSInteger) {
                return ((COSInteger) operand).floatValue();
            }
            if (operand instanceof COSFloat) {
                return ((COSFloat) operand).floatValue();
            }
            return -1;
        }
    }
}