PDFBox 2.0.8 - 从一个文档中提取图像并在另一个文档中使用它

时间:2018-03-02 12:30:27

标签: java pdfbox

我正在编写一个Java应用程序来充当模板读写器。我在处理文本方面取得了成功,但对图像有一些不足......

使用扩展PDFStreamEngine的类

获取图像非常简单
package readingPdf;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.util.Matrix;

public class ImageStripper extends PDFStreamEngine {

    ArrayList<Object  []> imagesData = null;
    public ImageStripper() throws IOException {
        // preparing PDFStreamEngine
        addOperator(new Concatenate());
        addOperator(new DrawObject());
        addOperator(new SetGraphicsStateParameters());
        addOperator(new Save());
        addOperator(new Restore());
        addOperator(new SetMatrix());
        imagesData = new ArrayList<Object[]>();
    }

    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
        String operation = operator.getName();
        if ("Do".equals(operation)) {
            COSName objectName = (COSName) operands.get(0);
            // get the PDF object
            PDXObject xobject = getResources().getXObject(objectName);
            // check if the object is an image object
            if (xobject instanceof PDImageXObject) {
                Object[] imageData = new Object[3];
                PDImageXObject image = (PDImageXObject) xobject;

                Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();

                // position of image in the pdf in terms of user space units
                System.out.println("position in PDF = " + ctmNew.getTranslateX() + ", " + ctmNew.getTranslateY()
                        + " in user space units");

                imageData[0] = ctmNew.getTranslateX();// xPos
                imageData[1] = ctmNew.getTranslateY();// yPos

                imageData[2] = image;//Image

                imagesData.add(imageData);

            } else if (xobject instanceof PDFormXObject) {
                PDFormXObject form = (PDFormXObject) xobject;
                showForm(form);
            }
        } else {
            super.processOperator(operator, operands);
        }
    }

    public ArrayList<Object[]> getImagesList(){
        return imagesData;
    }
}

接下来是其实施

public class PDFManager{

    private PDFParser parser;
    private PDDocument pdDoc;
    private PDDocument retDoc;
    private COSDocument cosDoc;
    private PDPage page;
    private String filePath;
    private File file; 

    public PDDocument transferImage() throws IOException {
        this.pdDoc = null;
        this.cosDoc = null;

        file = new File(filePath);
        parser = new PDFParser(new RandomAccessFile(file, "r"));
        parser.parse();
        cosDoc = parser.getDocument();
        pdDoc = new PDDocument(cosDoc);

        //Get Image Data
        ImageStripper imageStripper = new ImageStripper();
        imageStripper.processPage(pdDoc.getPage(0));
        ArrayList<Object []> imageList = imageStripper.getImagesList();

        //Close Doc
        pdDoc.close();
        cosDoc.close();

        //Create new PDF Doc
        retDoc = new PDDocument();
        page = new PDPage(new PDRectangle(PDRectangle.A4.getHeight(), PDRectangle.A4.getWidth())); 
        retDoc.addPage(page);

        PDPageContentStream cs = new PDPageContentStream(retDoc, page, AppendMode.OVERWRITE, true);

        for(int pos = 0; pos < imageList.size() ; pos++) {
            Object [] imageData = imageList.get(pos);

            float xPos = (float)imageData[0];
            float yPos = (float)imageData[1];
            PDImageXObject image = (PDImageXObject)imageData[2];
            cs.drawImage(image, xPos, yPos);
        }

        cs.close();
        return retDoc;
    }

    public static void main(String[] args) throws IOException {

        PDFManager pdfManager = new PDFManager();

        PDDocument doc =pdfManager.ToText("c:\\test\\test.pdf"); 

        doc.save("c:\\test\\test2.pdf");
        doc.close();
    }
}

现在问题就出现在我写cs.drawImage的时候。除了尝试保存新文件外,所有代码都没有任何问题执行...我得到例外COSStream has been closed and cannot be read. Perhaps its enclosing PDDocument has been closed?

我怀疑仍有元数据将图像链接到原始文档,因为调用PDImageXobject.createFromFile("c:\\test\\testImage.png", doc)会返回一个新的PDImageXObject实例,它可以完美地写入。当写入的PDDocument被传递到PDImageXObject时,我怀疑它会以某种方式链接。

我无法将图像保存到临时位置,因为这只是测试POC。

任何帮助将不胜感激

1 个答案:

答案 0 :(得分:1)

@ Tilman Hausherr

感谢您的解决方案

我将原始文档的结束移动到一个单独的方法,我在写完文件后调用了

public void closeFiles(){
    pdDoc.close();
    cosDoc.close();
}