使用 PDFBox 提取文本位置时得到负的 X 或 Y 坐标

时间:2018-02-02 15:48:47

标签: java pdf pdfbox

我正在尝试提取pdf中的所有文本及其坐标。 我正在使用 Apache PDFBox 2.0.8 并遵循示例程序DrawPrintTextLocations

它基本上可以正常工作，但对于某些 PDF，我得到的边界框 x 和 y 坐标是负值。例如，请参阅此 PDF 文件。

我的应用假设坐标系是普通 PDF 的坐标系（x 从左到右，y 从上到下）。因此这些负值打乱了我的计算。

以下是相关的代码。

import org.apache.fontbox.util.BoundingBox;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType3Font;
import org.apache.pdfbox.pdmodel.interactive.pagenavigation.PDThreadBead;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.List;

/**
 * Example showing how to obtain the x/y coordinates of extracted text and how to visualise them
 * on top of a rendered page image.
 *
 * <p>Every page is rendered to a bitmap, the bounding box of each glyph is drawn onto it in blue,
 * and the result is written next to the input file as {@code <name>-marked-<page>.png}. Glyph
 * boxes whose pixel-space x or y is negative are additionally printed to standard output.
 *
 * @author Ben Litchfield
 * @author Tilman Hausherr
 */
public class DrawPrintTextLocations extends PDFTextStripper {
    /** Resolution, in dots per inch, at which every page is rendered. */
    private static final float DPI = 200.0f;

    /** Scale factor applied to glyph boxes to match the image rendered at {@code DPI}. */
    private static final double PT2PX = DPI / 72.0;

    /** Additional scale applied to the graphics context used for drawing. */
    static final int SCALE = 1;

    /** Scales glyph boxes by {@code PT2PX} in both directions. */
    private final AffineTransform dpiAT = AffineTransform.getScaleInstance(PT2PX, PT2PX);

    /** Mirrors the y-axis around the page height; rebuilt for every page. */
    private AffineTransform flipAT;

    /** Compensates for the page's rotation entry; identity when the page is not rotated. */
    private AffineTransform rotateAT;

    /**
     * Translation derived from the crop box origin; rebuilt for every page.
     *
     * <p>NOTE(review): this transform is computed but never applied to the glyph rectangles in
     * this class. Confirm whether the text matrices delivered to {@link #writeString} are already
     * crop-box adjusted before either applying or removing it.
     */
    private AffineTransform transAT;

    private final String filename;
    private final PDDocument document;

    /** Graphics context of the page image currently being annotated. */
    private Graphics2D g2d;

    /**
     * Creates a stripper that annotates rendered pages of the given document.
     *
     * @param document the already loaded PDF whose pages are rendered and analysed
     * @param filename path of the PDF; used to derive the names of the generated PNG files
     * @throws IOException if the superclass constructor fails
     */
    public DrawPrintTextLocations(PDDocument document, String filename) throws IOException {
        this.document = document;
        this.filename = filename;
    }

    /**
     * Renders every page of a PDF and marks the text locations on the resulting images.
     *
     * @param args the command line arguments; when exactly one argument is given it is used as
     *             the path of the PDF, otherwise a built-in default path is used
     * @throws IOException If there is an error parsing the document.
     */
    public static void main(String[] args) throws IOException {
        String pdfLoc = "/debug/pdfbox/p2_VS008PI.pdf";

        if (args.length == 1) {
            pdfLoc = args[0];
        }

        try (PDDocument document = PDDocument.load(new File(pdfLoc))) {
            DrawPrintTextLocations stripper = new DrawPrintTextLocations(document, pdfLoc);
            stripper.setSortByPosition(true);

            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                stripper.stripPage(page);
            }
        }
    }

    /**
     * Renders one page, draws the glyph boxes found on it and writes the annotated image to disk.
     *
     * @param page zero-based index of the page to process
     * @throws IOException if rendering, text extraction or writing the image fails
     */
    private void stripPage(int page) throws IOException {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        BufferedImage image = pdfRenderer.renderImageWithDPI(page, DPI);

        PDPage pdPage = document.getPage(page);
        PDRectangle cropBox = pdPage.getCropBox();

        // flip y-axis
        flipAT = new AffineTransform();
        flipAT.translate(0, pdPage.getBBox().getHeight());
        flipAT.scale(1, -1);

        // page may be rotated
        rotateAT = rotationTransform(pdPage);

        // cropbox
        transAT = AffineTransform.getTranslateInstance(-cropBox.getLowerLeftX(), cropBox.getLowerLeftY());

        g2d = image.createGraphics();
        try {
            g2d.setStroke(new BasicStroke(0.1f));
            g2d.scale(SCALE, SCALE);

            // PDFTextStripper page numbers are one-based.
            setStartPage(page + 1);
            setEndPage(page + 1);

            // The extracted text itself is not needed; writeString() does the drawing.
            try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
                writeText(document, dummy);
            }
        } finally {
            // Release the graphics context even when text extraction fails.
            g2d.dispose();
        }

        ImageIO.write(image, "png", new File(markedImageName(filename, page + 1)));
    }

    /**
     * Builds the transform that compensates for the rotation entry of a page.
     *
     * @param pdPage the page whose rotation is read
     * @return the identity transform for an unrotated page, otherwise a translation (for 90, 180
     *         and 270 degrees) combined with a rotation by the page's rotation angle
     */
    private static AffineTransform rotationTransform(PDPage pdPage) {
        AffineTransform rotate = new AffineTransform();
        int rotation = pdPage.getRotation();
        if (rotation == 0) {
            return rotate;
        }

        PDRectangle mediaBox = pdPage.getMediaBox();
        switch (rotation) {
            case 90:
                rotate.translate(mediaBox.getHeight(), 0);
                break;
            case 270:
                rotate.translate(0, mediaBox.getWidth());
                break;
            case 180:
                rotate.translate(mediaBox.getWidth(), mediaBox.getHeight());
                break;
            default:
                break;
        }
        rotate.rotate(Math.toRadians(rotation));
        return rotate;
    }

    /**
     * Derives the name of the annotated image for one page from the path of the PDF.
     *
     * <p>The extension is replaced by {@code -marked-<pageNumber>.png}. Only a dot located after
     * the last path separator counts as the start of an extension, so a path without extension
     * (or with a dot only in a directory name) simply gets the suffix appended.
     *
     * @param pdfPath    path of the source PDF
     * @param pageNumber one-based page number to embed in the name
     * @return the path of the PNG file to write
     */
    private static String markedImageName(String pdfPath, int pageNumber) {
        int lastSeparator = Math.max(pdfPath.lastIndexOf('/'), pdfPath.lastIndexOf('\\'));
        int dot = pdfPath.lastIndexOf('.');
        String stem = dot > lastSeparator ? pdfPath.substring(0, dot) : pdfPath;
        return stem + "-marked-" + pageNumber + ".png";
    }

    /**
     * Draws the bounding box of every glyph of the given text run onto the current page image
     * instead of writing the text, and prints boxes that start at a negative x or y.
     *
     * @param string        the text of the run (unused)
     * @param textPositions the glyph positions that make up the run
     * @throws IOException if font metrics cannot be read
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        for (TextPosition text : textPositions) {
            PDFont font = text.getFont();
            BoundingBox bbox = font.getBoundingBox();

            // Box in glyph units: advance width of the first code, vertical extent of the font bbox.
            float xadvance = font.getWidth(text.getCharacterCodes()[0]); // todo: should iterate all chars
            Rectangle2D.Float rect1 = new Rectangle2D.Float(0, bbox.getLowerLeftY(), xadvance, bbox.getHeight());

            // Glyph units -> text space: Type 3 fonts use their own font matrix,
            // all other fonts are scaled by 1/1000 here.
            AffineTransform at = text.getTextMatrix().createAffineTransform();
            if (font instanceof PDType3Font) {
                at.concatenate(font.getFontMatrix().createAffineTransform());
            } else {
                at.scale(1 / 1000f, 1 / 1000f);
            }

            // Text matrix, then y-flip, page rotation and finally the DPI scale of the image.
            Shape s1 = at.createTransformedShape(rect1);
            s1 = flipAT.createTransformedShape(s1);
            s1 = rotateAT.createTransformedShape(s1);
            s1 = dpiAT.createTransformedShape(s1);

            g2d.setColor(Color.blue);
            g2d.draw(s1);

            Rectangle bounds = s1.getBounds();
            if (bounds.getX() < 0 || bounds.getY() < 0) {
                // The box starts left of or above the image origin.
                // NOTE(review): presumably such glyphs lie outside the page's crop box, i.e. they
                // are content hidden by the crop - confirm before treating this as an error.
                System.out.println(bounds.toString());
                System.out.println(rect1.toString());
            }
        }
    }
}

以下是上述pdf第一页输出的一些片段。

  

第10节 - 保险 & 其他财政资源
java.awt.Rectangle[x=-3237,y=40,width=19,height=43]
java.awt.Rectangle[x=-3216,y=40,width=20,height=43]
java.awt.Rectangle[x=-3194,y=40,width=23,height=43]
java.awt.Rectangle[x=-3170,y=40,width=22,height=43]

1 个答案:

答案 0 :(得分:1)

坐标为负的字符位于裁剪框（crop box）之外（坐标大于裁剪框高度/宽度的字符也是如此）。可以把裁剪框看作是从更大的内容中切出的一块。要查看全部内容，请运行以下代码

// Make the crop box as large as the media box so the whole page content becomes visible.
pdPage.setCropBox(pdPage.getMediaBox());

对 PDF 的每一页都执行上述代码，然后保存并查看该文件。

根据你的评论

  

按照你的建议把裁剪框设置为媒体框之后，PDF 在屏幕上的整体外观确实变了，现在 3 页被合并显示在了一起。

这表明在物理上,这是一张折叠的纸张,每面有3页。在线PDF显示为6页,便于在计算机上查看。