我需要在pdf文件中找到(光栅)图像并调整其大小 (以更改其分辨率)。
我的代码基于示例 PrintImageLocations。对于 XObject 图像,这部分工作正常:我扩展了 PDFStreamEngine,并为 "Do" 运算符添加了自定义处理:
我通过运算符的第一个操作数和资源获取原始的 PDImageXObject。
然后我从中创建一个 BufferedImage 并更改其像素数。
然后我通过 LosslessFactory 从该 BufferedImage 创建一个新的 PDImageXObject。
最后,我使用原始对象的名称将新对象放入页面资源中。
我尝试对内联图像(inline image)执行类似的操作,并且已经得到了缩放后的 BufferedImage,但是我不知道如何用它替换原始的内联图像。
将内联图像替换为 XObject 也可以,但同样,我不知道该如何进行这种替换……
下面是我的代码;有趣的部分是函数“ processOperator”。
// WIP!
// find raster images inside a pdf
// if their resolution is more than 900dpi
// then resize them
// reducing the resolution to 200dpi
// NB bug: fails on pdf files with more than one page
// ...DEBUG ScratchFileBuffer:516 - ScratchFileBuffer not closed!
// also fails on pdf with included pdf
// (e.g. latex \includegraphics{x.pdf})
// # to compile:
// apt install libpdfbox2-java
// export CLASSPATH=.:/usr/share/java/pdfbox2.jar:/usr/share/java/commons-logging.jar
// javac ReplaceBigImages.java
// # to run:
// java ReplaceBigImages x.pdf
// see
// https://pdfbox.apache.org/2.0/examples.html
// https://pdfbox.apache.org/docs/2.0.11/javadocs/
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDInlineImage;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.awt.image.BufferedImage;
import java.awt.Graphics2D;
import java.awt.RenderingHints;
import java.awt.Color;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class ReplaceBigImages extends PDFStreamEngine
{
private static Log log = LogFactory.getLog(ReplaceBigImages.class);
/**
 * Creates the engine and registers the operator handlers it needs.
 * Without them the graphics state (and therefore the current
 * transformation matrix read when an image is measured) is not updated
 * while a content stream is processed.
 *
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
public ReplaceBigImages() throws IOException
{
    // graphics-state stack and parameters
    this.addOperator(new Save());
    this.addOperator(new Restore());
    this.addOperator(new SetGraphicsStateParameters());
    // matrix handling
    this.addOperator(new Concatenate());
    this.addOperator(new SetMatrix());
    // XObject drawing
    this.addOperator(new DrawObject());
}
public static PDDocument document;
/**
 * Command-line entry point: loads the PDF named by the single argument,
 * runs every page through a {@link ReplaceBigImages} engine and writes the
 * result next to the input as {@code <name>_test.pdf}.
 *
 * <p>As before, the document is saved even when processing a page fails
 * (best effort), and the original exception still propagates. The document
 * is now always closed, including when saving fails.
 *
 * @param args exactly one element: the path of the PDF to process
 * @throws IOException if loading, processing, saving or closing fails
 */
public static void main( String[] args ) throws IOException
{
    if( args.length != 1 )
    {
        usage();
        return;
    }

    String inputName = args[0];
    // Only a trailing ".pdf" (any case) is an extension. Replacing every
    // ".pdf" in the path would mangle directory names, and a non-matching
    // extension would make the output path equal to the input path.
    String stem = inputName;
    if( inputName.regionMatches(true, inputName.length() - 4, ".pdf", 0, 4) )
    {
        stem = inputName.substring(0, inputName.length() - 4);
    }
    String outputName = stem + "_test.pdf";

    // Track the freshly loaded document locally so a failed load can never
    // lead to saving a stale document left in the static field.
    PDDocument loaded = null;
    try
    {
        loaded = PDDocument.load(new File(inputName));
        document = loaded; // processOperator() reads this static field
        ReplaceBigImages printer = new ReplaceBigImages();
        int pageNum = 0;
        for( PDPage page : loaded.getPages() )
        {
            pageNum++;
            log.info( "Processing page: " + pageNum );
            printer.processPage(page);
        }
    }
    finally
    {
        if( loaded != null )
        {
            try
            {
                loaded.save(outputName);
            }
            finally
            {
                // must run even when save() throws
                loaded.close();
            }
        }
    }
}
/**
 * Intercepts the {@code Do} and {@code BI} operators; everything else is
 * delegated to the superclass.
 *
 * <ul>
 * <li>{@code Do} on an image XObject: the image is resampled by
 *     {@link #changeImageResolution(PDImage)} and, if a resampled image is
 *     returned, stored under the same name in the resources the name was
 *     looked up in.</li>
 * <li>{@code Do} on a form XObject: the form is processed via
 *     {@code showForm}.</li>
 * <li>{@code BI}: the inline image is resampled, but the content stream is
 *     NOT rewritten (work in progress); the resampled image is only added
 *     to the resources.</li>
 * </ul>
 *
 * @param operator the operator being processed
 * @param operands the operands of the operator
 * @throws IOException if an image cannot be decoded or re-encoded
 */
@Override
protected void processOperator( Operator operator, List<COSBase> operands) throws IOException
{
    String operation = operator.getName();
    if( "Do".equals(operation) )
    {
        log.debug("### Found Do operator");
        // A malformed stream may carry no operand or a non-name operand;
        // skip it instead of failing with a cast/index exception.
        if( operands.isEmpty() || !(operands.get( 0 ) instanceof COSName) )
        {
            log.warn("Ignoring Do operator without a name operand");
            return;
        }
        COSName objectName = (COSName) operands.get( 0 );
        // Use the resources of the stream currently being processed for both
        // the lookup and the replacement. Writing to the page resources would
        // miss images drawn from inside a form XObject.
        PDResources resources = getResources();
        PDXObject xobject = resources.getXObject( objectName );
        if( xobject instanceof PDImageXObject )
        {
            log.debug(String.format("Looking at %s (%s)", objectName.getName(), xobject));
            PDImageXObject image = (PDImageXObject) xobject;
            BufferedImage scaledImage = changeImageResolution(image);
            if( scaledImage != null )
            {
                log.debug(String.format("Replacing with %s", scaledImage));
                PDImageXObject replacement_img = LosslessFactory.createFromImage(document, scaledImage);
                resources.put(objectName, replacement_img);
            }
        }
        else if( xobject instanceof PDFormXObject )
        {
            showForm((PDFormXObject) xobject);
        }
    }
    else if( "BI".equals(operation) )
    {
        log.debug("### Found BI operator");
        PDResources resources = getResources();
        PDInlineImage image = new PDInlineImage(operator.getImageParameters(),
                                                operator.getImageData(),
                                                resources);
        BufferedImage scaledImage = changeImageResolution(image);
        if( scaledImage != null )
        {
            log.debug(String.format("Replacing with %s", scaledImage));
            PDImageXObject replacement_img = LosslessFactory.createFromImage(document, scaledImage);
            // WIP: the content stream still draws the original inline image;
            // the resampled copy is only registered as a resource here.
            resources.add(replacement_img, "pippo");
        }
    }
    else
    {
        super.processOperator( operator, operands);
    }
}
/**
 * Resamples {@code image} to 200 dpi when its horizontal resolution, as
 * drawn with the current transformation matrix, exceeds 200 dpi.
 *
 * <p>The resolution is computed as pixel width divided by the drawn width
 * in inches (horizontal scaling factor / 72).
 *
 * @param image the image XObject or inline image to inspect
 * @return the resampled image, or {@code null} when the image is left
 *         alone (resolution not above the threshold, or degenerate
 *         size/scale)
 * @throws IOException if the image cannot be decoded
 */
protected BufferedImage changeImageResolution( PDImage image)
throws IOException
{
    final float maxResolution = 200f;
    final int desiredResolution = 200;

    int imageWidth = image.getWidth();
    int imageHeight = image.getHeight();
    Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
    float imageXScale = Math.abs(ctmNew.getScalingFactorX());
    float imageYScale = Math.abs(ctmNew.getScalingFactorY());
    // A zero scale or empty image would give an infinite resolution and a
    // 0-pixel target, which BufferedImage rejects; nothing useful to resample.
    if (imageWidth <= 0 || imageHeight <= 0 || imageXScale == 0f || imageYScale == 0f) {
        log.debug("Skipping image with degenerate size or scale: ("+imageWidth+","+imageHeight+")@("+imageXScale+","+imageYScale+")");
        return null;
    }
    float resolution = imageWidth / ( imageXScale / 72 );
    String stencil = image.isStencil() ? " (stencil)" : "";
    // TODO: take into consideration the size at which this file is included by TeX
    log.debug("size: ("+imageWidth+","+imageHeight+")@("+imageXScale+","+imageYScale+") resolution = "+resolution+stencil);
    if ( !(resolution > maxResolution) ) {
        return null;
    }

    BufferedImage bImage;
    if (image.isStencil()) {
        log.warn("Is stencil; painting black.");
        bImage = image.getStencilImage(Color.black);
    } else {
        bImage = image.getImage();
    }

    float xFactor = (imageXScale / 72) * desiredResolution / imageWidth;
    float yFactor = (imageYScale / 72) * desiredResolution / imageHeight;
    log.info("Scaling x to "+xFactor);
    // Never go below one pixel: BufferedImage requires positive dimensions.
    int dWidth = Math.max(1, (int) (xFactor * imageWidth));
    int dHeight = Math.max(1, (int) (yFactor * imageHeight));
    // the image type is from
    // https://docs.oracle.com/javase/6/docs/api/constant-values.html#java.awt.image.
    log.debug(String.format("Destination: %d x %d [%s]",
                            dWidth,
                            dHeight,
                            bImage.getType()));
    // TYPE_CUSTOM cannot be passed to the BufferedImage constructor.
    int targetType = bImage.getType();
    if (targetType == BufferedImage.TYPE_CUSTOM) {
        targetType = BufferedImage.TYPE_INT_ARGB;
    }
    BufferedImage scaledImage = new BufferedImage(dWidth, dHeight, targetType);
    Graphics2D graphics2D = scaledImage.createGraphics();
    try {
        graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION,
                                    RenderingHints.VALUE_INTERPOLATION_BILINEAR);
        graphics2D.setRenderingHint(RenderingHints.KEY_RENDERING,
                                    RenderingHints.VALUE_RENDER_QUALITY);
        graphics2D.setRenderingHint(RenderingHints.KEY_ANTIALIASING,
                                    RenderingHints.VALUE_ANTIALIAS_ON);
        graphics2D.drawImage(bImage, 0, 0, dWidth, dHeight, null);
    } finally {
        // release the graphics context even if drawing fails
        graphics2D.dispose();
    }
    return scaledImage;
}
/** Prints a one-line usage message to standard error. */
private static void usage()
{
    final String className = ReplaceBigImages.class.getName();
    System.err.println( "Usage: java " + className + " <input-pdf>" );
}
}
答案 0 :(得分:1)
在Tilman Hausherr的帮助下(详细信息here),我认为我有一段代码可以调整InlineImages和XObjects的大小。 以下代码中的方法是:
代码如下:
package it.sissa.medialab.pdfimages;
// Find raster images inside a pdf
// if their resolution is outside of a given threshold
// then resize them
// reducing the resolution to a given target resolution.
// If the pdf will be included inside another pdf
// (as with TeX \includegraphics)
// one can provide a --scale factor
// Usage
// -----
// java -Dlog4j.configurationFile=/tmp/log4j.xml \
// it/sissa/medialab/pdfimages/ReplaceBigImages \
// --scale=XXX -v \
// pesky-file.pdf
// CLASSPATH
// ---------
// This script uses pdfbox, picocli (for the command line), commons-io, commons-logging and log4j
// So the CLASSPATH could be:
// export CLASSPATH=$HOME/.m2/repository/org/apache/pdfbox/pdfbox/2.0.14/pdfbox-2.0.14.jar:$HOME/.m2/repository/commons-logging/commons-logging/1.2/commons-logging-1.2.jar:$HOME/.m2/repository/commons-io/commons-io/2.6/commons-io-2.6.jar:$HOME/.m2/repository/info/picocli/picocli/3.9.5/picocli-3.9.5.jar:$HOME/.m2/repository/org/apache/logging/log4j/log4j-api/2.11.2/log4j-api-2.11.2.jar:$HOME/.m2/repository/org/apache/logging/log4j/log4j-core/2.11.2/log4j-core-2.11.2.jar:$HOME/typeset/scripts/pdfimages/target/classes/
// log4j.xml (example)
//
// <?xml version="1.0" encoding="UTF-8"?>
// <Configuration status="info" strict="true" name="XMLConfigTest" packages="org.apache.logging.log4j.test">
// <Appenders>
// <Appender type="Console" name="STDOUT">
// <Layout type="PatternLayout" pattern="%-5p %c{1}:%L %M - %m%n"/>
// </Appender>
// </Appenders>
// <Loggers>
// <Logger name="pippo"
// level="info"
// additivity="false">
// <AppenderRef ref="STDOUT"/>
// </Logger>
// <Root level="info">
// <AppenderRef ref="STDOUT"/>
// </Root>
// </Loggers>
// </Configuration>
// export CLASSPATH=$HOME/.m2/repository/org/apache/pdfbox/pdfbox/2.0.14/pdfbox-2.0.14.jar:$HOME/.m2/repository/commons-logging/commons-logging/1.2/commons-logging-1.2.jar:$HOME/.m2/repository/commons-io/commons-io/2.6/commons-io-2.6.jar:$HOME/.m2/repository/info/picocli/picocli/3.9.5/picocli-3.9.5.jar:$HOME/.m2/repository/org/apache/logging/log4j/log4j-api/2.11.2/log4j-api-2.11.2.jar:$HOME/.m2/repository/org/apache/logging/log4j/log4j-core/2.11.2/log4j-core-2.11.2.jar:./target/classes
// java -Dlog4j.configurationFile=log4j.xml it/sissa/medialab/pdfimages/ReplaceBigImages --max=220 --min=200 -v --target=220 src/resources/META-INF/test-files/d.pdf
import java.awt.Color;
import java.awt.Graphics2D;
import java.awt.RenderingHints;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import javax.imageio.ImageIO;
import org.apache.commons.io.IOUtils;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.core.config.Configurator;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDInlineImage;
import org.apache.pdfbox.util.Matrix;
import picocli.CommandLine;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.Parameters;
@Command(description = "Resample raster images in the given pdf file when their resolution is outside the given min-max range.",
name = "pdfimage",
mixinStandardHelpOptions = true,
version = "pdfimages 1.2")
public class ReplaceBigImages
extends PDFStreamEngine
implements Runnable
{
// private static Log log = LogFactory.getLog(ReplaceBigImages.class);
private static Logger log = LogManager.getLogger(ReplaceBigImages.class);
public boolean hasInlineImages = false;
public LinkedBlockingQueue<Matrix> tms = new LinkedBlockingQueue<Matrix>();
@Parameters(paramLabel = "FILE", description = "File to process.")
private File inputFile;
@Option(names = "--min", description = "Resolution lower bound; defaults to 150", required = false)
private float min_res=150;
@Option(names = "--max", description = "Resolution upper bound; defaults to 220", required = false)
private float max_res=220;
@Option(names = "--target", description = "Desired resolution (only if the current resolution exeedes the bounds). Defaults to 220", required = false)
private int desiredResolution = 220;
@Option(names = "--scale", description = "Scale factor. The options min, max and target will be multiplied by this factor. This should be the same scale at which the pdf \"image\" under analysis will be included by the tex file into the final pdf. See 00-readme.txt. Defaults to 1.0", required = false)
private float scale=1;
@Option(names = { "-v", "--verbose" }, description = "Be verbose.")
private boolean verbose = false;
private int myCounter = 0;
/**
 * Creates the engine and registers the operator handlers it needs.
 * Without them the graphics state (and therefore the transformation
 * matrix read in {@code processOperator}) is not updated while a content
 * stream is processed.
 *
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
public ReplaceBigImages() throws IOException
{
    // graphics-state stack and parameters
    this.addOperator(new Save());
    this.addOperator(new Restore());
    this.addOperator(new SetGraphicsStateParameters());
    // matrix handling
    this.addOperator(new Concatenate());
    this.addOperator(new SetMatrix());
    // XObject drawing
    this.addOperator(new DrawObject());
}
/**
 * Processes {@code inputFile}: scales the resolution bounds by
 * {@code scale}, runs every page through this engine, rewrites inline
 * images on pages that contain them, and saves the result next to the
 * input as {@code <name>-m.pdf}.
 *
 * <p>Errors are logged rather than thrown. As before, whatever was
 * processed so far is still saved when a page fails (best effort).
 */
@Override
public void run() {
    if (verbose) {
        // https://stackoverflow.com/a/44678752/1581629
        Configurator.setLevel(log.getName(), Level.DEBUG);
    }
    min_res = min_res * scale;
    max_res = max_res * scale;
    desiredResolution = (int) (desiredResolution * scale);

    // Track the freshly loaded document locally so a failed load can never
    // lead to saving a stale document left in the static field.
    PDDocument loaded = null;
    try {
        loaded = PDDocument.load(inputFile);
        document = loaded; // processOperator() reads this static field
        int pageNum = 0;
        for (PDPage page : loaded.getPages()) {
            pageNum++;
            log.info("Processing page: " + pageNum);
            // Inline-image bookkeeping is per page. Without this reset a page
            // with no inline images after one that had some would call
            // remove() on an empty queue, and leftover matrices from earlier
            // pages would be applied to later ones.
            hasInlineImages = false;
            tms.clear();
            processPage(page);
            Matrix inlineCtm = tms.poll();
            if (hasInlineImages && inlineCtm != null) {
                replaceInlineImages(loaded, page, inlineCtm);
            }
        }
    } catch (Exception e) {
        log.error("Processing of " + inputFile + " failed", e);
    } finally {
        if (loaded != null) {
            // Only a trailing ".pdf" (any case) is an extension. Replacing
            // every ".pdf" in the path could mangle directory names or make
            // the output path equal to the input path.
            String inputPath = inputFile.getPath();
            String stem = inputPath;
            if (inputPath.regionMatches(true, inputPath.length() - 4, ".pdf", 0, 4)) {
                stem = inputPath.substring(0, inputPath.length() - 4);
            }
            String outputPath = stem + "-m.pdf";
            log.debug("Ready to save");
            try {
                loaded.save(outputPath);
                log.debug("Document saved");
            } catch (IOException e) {
                log.error("Could not save " + outputPath, e);
            } finally {
                // close even when save() failed
                try {
                    loaded.close();
                } catch (IOException e) {
                    log.error("Could not close document", e);
                }
            }
        }
    }
}
public static PDDocument document;
/**
 * Command-line entry point: hands a new {@link ReplaceBigImages} and the
 * raw arguments to picocli's {@code CommandLine.run}.
 *
 * @param args command-line arguments
 * @throws IOException if the engine cannot be constructed
 */
public static void main( String[] args ) throws IOException
{
    CommandLine.run(new ReplaceBigImages(), args);
}
/**
 * Intercepts the {@code Do} and {@code BI} operators; everything else is
 * delegated to the superclass.
 *
 * <ul>
 * <li>{@code Do} on an image XObject: the image is resampled by
 *     {@code changeImageResolution} and, if a resampled image is returned,
 *     stored under the same name in the resources the name was looked up
 *     in.</li>
 * <li>{@code Do} on a form XObject: the form is processed via
 *     {@code showForm}.</li>
 * <li>{@code BI}: only records that an inline image was seen, together
 *     with a snapshot of the current transformation matrix; the actual
 *     replacement happens later in {@code replaceInlineImages}.</li>
 * </ul>
 *
 * @param operator the operator being processed
 * @param operands the operands of the operator
 * @throws IOException if an image cannot be decoded or re-encoded
 */
@Override
protected void processOperator( Operator operator, List<COSBase> operands) throws IOException
{
    String operation = operator.getName();
    if( "Do".equals(operation) )
    {
        log.debug("### Found Do operator");
        // A malformed stream may carry no operand or a non-name operand;
        // skip it instead of failing with a cast/index exception.
        if( operands.isEmpty() || !(operands.get( 0 ) instanceof COSName) )
        {
            log.warn("Ignoring Do operator without a name operand");
            return;
        }
        COSName objectName = (COSName) operands.get( 0 );
        // Use the resources of the stream currently being processed for both
        // the lookup and the replacement. Writing to the page resources would
        // miss images drawn from inside a form XObject.
        PDResources resources = getResources();
        PDXObject xobject = resources.getXObject( objectName );
        if( xobject instanceof PDImageXObject )
        {
            log.debug(String.format("Looking at %s (%s)", objectName.getName(), xobject));
            PDImageXObject image = (PDImageXObject) xobject;
            Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
            BufferedImage scaledImage = changeImageResolution(image, ctmNew);
            if( scaledImage != null )
            {
                log.debug(String.format("Replacing with %s", scaledImage));
                PDImageXObject replacement_img = LosslessFactory.createFromImage(document, scaledImage);
                replacement_img.setStencil(image.isStencil());
                resources.put(objectName, replacement_img);
            }
        }
        else if( xobject instanceof PDFormXObject )
        {
            showForm((PDFormXObject) xobject);
        }
    }
    else if( "BI".equals(operation) )
    {
        hasInlineImages = true;
        // Save the current transformation matrix; it is needed later to
        // compute the resolution of the inline image. Store a copy: the
        // graphics state hands out its own matrix object, which later
        // matrix operators may modify in place.
        Matrix ctmNew = getGraphicsState().getCurrentTransformationMatrix();
        tms.add(ctmNew.clone());
    }
    else
    {
        super.processOperator( operator, operands);
    }
}
/**
 * Rewrites the content stream of {@code page}, replacing every inline
 * image whose resolution is out of range with a resampled inline image.
 *
 * <p>The page content is parsed into tokens; each {@code BI} operator is
 * inspected and, when {@code changeImageResolution} returns a resampled
 * image, swapped for a new {@code BI} operator carrying the new dictionary
 * and data. The page content is only replaced when at least one image was
 * actually resampled.
 *
 * <p>Limitation: the single {@code ctm} is used for every inline image on
 * the page.
 *
 * @param document the document the page belongs to
 * @param page     the page whose content stream is rewritten
 * @param ctm      transformation matrix used to compute image resolution
 * @throws Exception if parsing, resampling or writing fails
 */
protected void replaceInlineImages (PDDocument document, PDPage page, Matrix ctm) throws Exception {
    PDFStreamParser parser = new PDFStreamParser(page);
    parser.parse();
    List<Object> tokens = parser.getTokens();
    List<Object> newTokens = new ArrayList<Object>(tokens.size());
    PDResources resources = page.getResources();
    boolean replacedAny = false;

    for (Object token : tokens) {
        if (token instanceof Operator && "BI".equals(((Operator) token).getName())) {
            Operator operator = (Operator) token;
            log.debug("### Found BI operator");
            // NB: there is no trace of ID or EI operators in the token list,
            // so replacing the BI operator replaces the whole BI-ID-EI block
            PDInlineImage image = new PDInlineImage(operator.getImageParameters(),
                                                    operator.getImageData(),
                                                    resources);
            BufferedImage scaledImage = changeImageResolution(image, ctm);
            if (scaledImage != null) {
                log.debug(String.format("Replacing with %s", scaledImage));
                // Encode through an image XObject, then reuse its dictionary
                // and raw (still encoded) bytes for the inline image.
                PDImageXObject helper_img = LosslessFactory.createFromImage(document, scaledImage);
                helper_img.setStencil(image.isStencil());
                byte[] img_data;
                try (InputStream img_data_stream = helper_img.getCOSObject().createRawInputStream()) {
                    img_data = IOUtils.toByteArray(img_data_stream);
                }
                Operator newBIoperator = Operator.getOperator("BI");
                newBIoperator.setImageParameters(helper_img.getCOSObject());
                newBIoperator.setImageData(img_data);
                newTokens.add(newBIoperator);
                replacedAny = true;
                continue;
            }
        }
        newTokens.add(token);
    }

    if (!replacedAny) {
        // Nothing changed: keep the original content stream instead of
        // re-serializing an identical token list.
        log.debug("No inline image replaced; page contents left untouched");
        return;
    }

    // Build the new stream, compressed so the rewrite does not inflate the
    // file. The stream is closed before being attached, even on failure.
    PDStream newContents = new PDStream( document );
    try (OutputStream outputStream = newContents.createOutputStream(COSName.FLATE_DECODE)) {
        ContentStreamWriter writer = new ContentStreamWriter( outputStream );
        writer.writeTokens( newTokens );
    }
    page.setContents( newContents );
    log.debug("New contents set for page");
}
/**
 * Resamples {@code image} to {@code desiredResolution} when its horizontal
 * resolution, as drawn with {@code ctmNew}, lies outside
 * [{@code min_res}, {@code max_res}].
 *
 * <p>The resolution is computed as pixel width divided by the drawn width
 * in inches (horizontal scaling factor / 72). When debug logging is
 * enabled, the original and scaled images are also written as
 * {@code original-N.png} / {@code scaled-N.png} in the working directory.
 *
 * @param image  the image XObject or inline image to inspect
 * @param ctmNew transformation matrix the image is drawn with
 * @return the resampled image, or {@code null} when the image is left
 *         alone (resolution within bounds, degenerate size/scale, or a
 *         non-positive target resolution)
 * @throws IOException if the image cannot be decoded or a debug PNG cannot
 *                     be written
 */
protected BufferedImage changeImageResolution( PDImage image, Matrix ctmNew)
throws IOException
{
    int imageWidth = image.getWidth();
    int imageHeight = image.getHeight();
    float imageXScale = Math.abs(ctmNew.getScalingFactorX());
    float imageYScale = Math.abs(ctmNew.getScalingFactorY());
    // A zero scale or empty image would give an infinite resolution and a
    // 0-pixel target, which BufferedImage rejects; nothing useful to resample.
    if (imageWidth <= 0 || imageHeight <= 0 || imageXScale == 0f || imageYScale == 0f) {
        log.debug("Skipping image with degenerate size or scale: (" + imageWidth + "," + imageHeight
                + ")@scale(" + imageXScale + "," + imageYScale + ")");
        return null;
    }
    float resolution = imageWidth / (imageXScale / 72);
    int bufferedImageType = BufferedImage.TYPE_INT_RGB;
    String stencil = "";
    if (image.isStencil()) {
        stencil = " (stencil)";
        bufferedImageType = BufferedImage.TYPE_BYTE_BINARY;
    } else if (image.getColorSpace() == PDDeviceGray.INSTANCE) {
        bufferedImageType = BufferedImage.TYPE_BYTE_GRAY;
    }
    // TODO: take into consideration the size at which this file is included by TeX
    log.debug("Original size: (" + imageWidth + "," + imageHeight + ")@scale(" + imageXScale + "," + imageYScale
            + "); resolution = (" + min_res + "<)" + resolution + "(<" + max_res + ")" + stencil);
    if (!(resolution > max_res || resolution < min_res)) {
        return null;
    }

    // -1 means "not specified": aim for the middle of the allowed range
    if (desiredResolution == -1) {
        desiredResolution = (int) ((max_res + min_res) / 2.0);
    }
    if (desiredResolution <= 0) {
        // would produce a zero or negative image size
        log.warn("Ignoring non-positive target resolution " + desiredResolution);
        return null;
    }

    BufferedImage bImage = new BufferedImage(imageWidth, imageHeight, bufferedImageType);
    bImage.setData(image.getImage().getData());
    // do not do "bImage = image.getImage()" or the image type will be wrong
    if (image.isStencil()) {
        log.warn("Image is stencil, plese check.");
        // bImage = image.getStencilImage(Color.red);
        // ↑ No: see https://lists.apache.org/thread.html/8d6477f6e057b83ab34655041045dc9e0288b4eeba6d65b52a92ab52@%3Cusers.pdfbox.apache.org%3E
    }
    if (log.isDebugEnabled()) {
        myCounter += 1;
        log.debug("Saving original img to \"original-"+myCounter+".png\"");
        File outputfile = new File("original-"+myCounter+".png");
        ImageIO.write(bImage, "png", outputfile);
    }

    float xFactor = (imageXScale / 72) * desiredResolution / imageWidth;
    float yFactor = (imageYScale / 72) * desiredResolution / imageHeight;
    log.info("Scaling x by factor " + xFactor + " (desired resolution is " + desiredResolution + ")");
    // Never go below one pixel: BufferedImage requires positive dimensions.
    int dWidth = Math.max(1, (int) (xFactor * imageWidth));
    int dHeight = Math.max(1, (int) (yFactor * imageHeight));
    // the image type is from
    // https://docs.oracle.com/javase/6/docs/api/constant-values.html#java.awt.image.
    log.debug(String.format("Destination: %d x %d [%s]", dWidth, dHeight, bImage.getType()));
    BufferedImage scaledImage = new BufferedImage(dWidth, dHeight, bImage.getType());
    Graphics2D graphics2D = scaledImage.createGraphics();
    try {
        graphics2D.setRenderingHint(RenderingHints.KEY_INTERPOLATION, RenderingHints.VALUE_INTERPOLATION_BILINEAR);
        graphics2D.setRenderingHint(RenderingHints.KEY_RENDERING, RenderingHints.VALUE_RENDER_QUALITY);
        graphics2D.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
        graphics2D.drawImage(bImage, 0, 0, dWidth, dHeight, null);
    } finally {
        // release the graphics context even if drawing fails
        graphics2D.dispose();
    }
    if (log.isDebugEnabled()) {
        log.debug("Saving scaled img to \"scaled-"+myCounter+".png\"");
        File outputfile = new File("scaled-"+myCounter+".png");
        ImageIO.write(scaledImage, "png", outputfile);
    }
    return scaledImage;
}
/**
 * Reports the current value of the {@code hasInlineImages} flag.
 *
 * @return the value of {@code hasInlineImages}
 */
public boolean isHasInlineImages() {
    final boolean flag = this.hasInlineImages;
    return flag;
}
/**
 * Overwrites the {@code hasInlineImages} flag.
 *
 * @param hasInlineImages the new value of the flag
 */
public void setHasInlineImages(boolean hasInlineImages) {
    final boolean newValue = hasInlineImages;
    this.hasInlineImages = newValue;
}
/**
 * Gives access to the queue of saved transformation matrices. The queue
 * itself is returned, not a copy.
 *
 * @return the {@code tms} queue
 */
public LinkedBlockingQueue<Matrix> getTms() {
    final LinkedBlockingQueue<Matrix> queue = this.tms;
    return queue;
}
/**
 * Replaces the queue of saved transformation matrices. The given queue is
 * stored as is, not copied.
 *
 * @param tms the queue to use from now on
 */
public void setTms(LinkedBlockingQueue<Matrix> tms) {
    final LinkedBlockingQueue<Matrix> replacement = tms;
    this.tms = replacement;
}
}