我正在尝试提取pdf中的所有文本及其坐标。 我正在使用 Apache PDFBox 2.0.8 并遵循示例程序DrawPrintTextLocations 。
它基本上可以正常工作,但对于某些PDF,我得到的边界框x和y坐标是负值。例如,请参阅此PDF文件。
我的应用假设使用普通的PDF坐标系(x从左到右,y从上到下)。因此这些负值打乱了我的计算。
以下是相关的代码。
import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.List;
/**
 * This is an example on how to get some x/y coordinates of text and to show them in a rendered
 * image.
 *
 * <p>Each page is rendered at {@code DPI} dots per inch. For every extracted
 * {@link TextPosition} a blue rectangle, built from the font bounding box and the advance width
 * of the first character code, is drawn on top of the rendering. The annotated image is written
 * next to the input file as {@code <name>-marked-<page>.png}.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public class DrawPrintTextLocations extends PDFTextStripper {
    /** Resolution, in dots per inch, at which pages are rendered. */
    private static final float DPI = 200.0f;

    /** Factor converting PDF units (72 per inch) to pixels at {@code DPI}. */
    private static final double PT2PX = DPI / 72.0;

    /** Additional scale applied to the graphics context before drawing. */
    static final int SCALE = 1;

    /** Flips the y-axis using the height of the page bounding box; rebuilt for every page. */
    private AffineTransform flipAT;

    /** Compensates for the page rotation (0, 90, 180 or 270 degrees); rebuilt for every page. */
    private AffineTransform rotateAT;

    /**
     * Translation derived from the crop box lower-left corner; rebuilt for every page.
     * NOTE(review): this transform is assigned in stripPage but is not applied to any shape in
     * this class - confirm whether text rectangles should be shifted by it.
     */
    private AffineTransform transAT;

    /** Scales the final shapes to the pixel grid of the rendered image. */
    private final AffineTransform dpiAT = AffineTransform.getScaleInstance(PT2PX, PT2PX);

    /** Path of the PDF being processed; the output image names are derived from it. */
    private final String filename;

    /** Graphics context of the image of the page currently being processed. */
    private Graphics2D g2d;

    /** The document whose pages are rendered and stripped. */
    private final PDDocument document;

    /**
     * Creates a stripper that marks text locations of the given document.
     *
     * @param document the loaded PDF document to process
     * @param filename path of the PDF file, used to derive the names of the output images
     * @throws IOException If there is an error loading the properties.
     */
    public DrawPrintTextLocations(PDDocument document, String filename) throws IOException {
        this.document = document;
        this.filename = filename;
    }

    /**
     * This will print the documents data.
     *
     * @param args The command line arguments; a single argument overrides the default PDF path.
     * @throws IOException If there is an error parsing the document.
     */
    public static void main(String[] args) throws IOException {
        String pdfLoc = "/debug/pdfbox/p2_VS008PI.pdf";
        if (args.length == 1) {
            pdfLoc = args[0];
        }
        try (PDDocument document = PDDocument.load(new File(pdfLoc))) {
            DrawPrintTextLocations stripper = new DrawPrintTextLocations(document, pdfLoc);
            stripper.setSortByPosition(true);
            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                stripper.stripPage(page);
            }
        }
    }

    /**
     * Renders one page, draws the text rectangles on it and writes the result as a PNG file.
     *
     * @param page zero-based index of the page to process
     * @throws IOException if rendering, text extraction or writing the image fails
     */
    private void stripPage(int page) throws IOException {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        BufferedImage image = pdfRenderer.renderImageWithDPI(page, DPI);
        PDPage pdPage = document.getPage(page);
        PDRectangle cropBox = pdPage.getCropBox();

        // flip y-axis
        flipAT = new AffineTransform();
        flipAT.translate(0, pdPage.getBBox().getHeight());
        flipAT.scale(1, -1);

        // page may be rotated
        rotateAT = new AffineTransform();
        int rotation = pdPage.getRotation();
        if (rotation != 0) {
            PDRectangle mediaBox = pdPage.getMediaBox();
            switch (rotation) {
                case 90:
                    rotateAT.translate(mediaBox.getHeight(), 0);
                    break;
                case 270:
                    rotateAT.translate(0, mediaBox.getWidth());
                    break;
                case 180:
                    rotateAT.translate(mediaBox.getWidth(), mediaBox.getHeight());
                    break;
                default:
                    break;
            }
            rotateAT.rotate(Math.toRadians(rotation));
        }

        // cropbox
        transAT = AffineTransform.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY());

        g2d = image.createGraphics();
        try {
            g2d.setStroke(new BasicStroke(0.1f));
            g2d.scale(SCALE, SCALE);
            setStartPage(page + 1);
            setEndPage(page + 1);
            // The extracted text is discarded; writeText is run for the writeString callbacks,
            // which do the drawing.
            try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
                writeText(document, dummy);
            }
        } finally {
            // Release the graphics context even when text extraction fails.
            g2d.dispose();
        }

        ImageIO.write(image, "png", new File(markedImageName(filename, page + 1)));
    }

    /**
     * Builds the output image path for a page: the PDF path without its extension, followed by
     * {@code -marked-<pageNumber>.png}.
     *
     * <p>Only a dot inside the last path segment (and not at its very start) is treated as the
     * start of an extension, so paths without an extension or with dots in directory names are
     * handled without error.
     *
     * @param pdfPath path of the source PDF
     * @param pageNumber one-based page number
     * @return the path of the PNG file to write
     */
    private static String markedImageName(String pdfPath, int pageNumber) {
        int lastSeparator = Math.max(pdfPath.lastIndexOf('/'), pdfPath.lastIndexOf('\\'));
        int lastDot = pdfPath.lastIndexOf('.');
        String base = lastDot > lastSeparator + 1 ? pdfPath.substring(0, lastDot) : pdfPath;
        return base + "-marked-" + pageNumber + ".png";
    }

    /**
     * Override the default functionality of PDFTextStripper.
     *
     * <p>Draws, in blue, one rectangle per text position and prints the rectangles whose
     * integer bounds have a negative x or y coordinate.
     *
     * @param string the text of the current run (not used here)
     * @param textPositions the positions of the individual characters of the run
     * @throws IOException if the font metrics cannot be read
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        for (TextPosition text : textPositions) {
            AffineTransform at = text.getTextMatrix().createAffineTransform();
            PDFont font = text.getFont();
            BoundingBox bbox = font.getBoundingBox();

            // advance width of the first character code, bounding box height
            float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
            Rectangle2D.Float rect1 = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

            if (font instanceof PDType3Font) {
                // Type 3 fonts: apply the font's own font matrix
                at.concatenate(font.getFontMatrix().createAffineTransform());
            } else {
                // all other fonts: scale the glyph units by 1/1000
                at.scale(1 / 1000f, 1 / 1000f);
            }

            // text matrix -> flipped y-axis -> page rotation -> pixels
            Shape s1 = at.createTransformedShape(rect1);
            s1 = flipAT.createTransformedShape(s1);
            s1 = rotateAT.createTransformedShape(s1);
            s1 = dpiAT.createTransformedShape(s1);

            g2d.setColor(Color.blue);
            g2d.draw(s1);

            Rectangle bounds = s1.getBounds();
            if (bounds.getX() < 0 || bounds.getY() < 0) {
                // Diagnostic output for rectangles that end up left of / above the image origin.
                // NOTE(review): presumably these belong to text placed outside the page's crop
                // box - confirm against the PDF in question.
                System.out.println(bounds.toString());
                System.out.println(rect1.toString());
            }
        }
    }
}
以下是上述pdf第一页输出的一些片段。
第10节 - 保险 & 其他财政资源 java.awt.Rectangle[x=-3237,y=40,width=19,height=43] java.awt.Rectangle[x=-3216,y=40,width=20,height=43] java.awt.Rectangle[x=-3194,y=40,width=23,height=43] java.awt.Rectangle[x=-3170,y=40,width=22,height=43]
答案 0 :(得分:1)
坐标为负的字符位于裁剪框(crop box)之外(坐标大于裁剪框高度/宽度的字符也是如此)。可以把裁剪框看作是从更大的内容中截取出来的一块区域。要查看全部内容,请运行以下代码
pdPage.setCropBox(pdPage.getMediaBox());
对PDF的每一页都执行该代码,然后保存文件并查看。
根据你的评论
按照你的建议将裁剪框设置为媒体框后,PDF在屏幕上的整体外观确实改变了,现在我看到3页内容合并显示在一起。
这表明在物理上,这是一张折叠的纸张,每面有3页。在线PDF显示为6页,便于在计算机上查看。