我的代码基于此 https://github.com/Betel-Flowers/BetelFlowers/blob/master/BetelFlowers-ejb/src/main/java/com/betel/flowers/pdf/util/RemoveBlankPageFromPDF.java 或这个 http://www.rgagnon.com/javadetails/java-detect-and-remove-blank-page-in-pdf.html
我正在尝试使用字节数组作为输入,将字节数组作为输出。
这是我的代码
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import com.lowagie.text.Document;
import com.lowagie.text.DocumentException;
import com.lowagie.text.pdf.PdfCopy;
import com.lowagie.text.pdf.PdfDictionary;
import com.lowagie.text.pdf.PdfImportedPage;
import com.lowagie.text.pdf.PdfName;
import com.lowagie.text.pdf.PdfReader;
import com.lowagie.text.pdf.RandomAccessFileOrArray;
public class RemoveBlankPageFromPDF {
// value where we can consider that this is a blank image
// can be much higher or lower depending of what is considered as a blank page
public static final int BLANK_THRESHOLD = 160;
public static byte[] removeBlankPdfPages(byte[] fuente) throws IOException, DocumentException{
PdfReader r = null;
RandomAccessFileOrArray raf = null;
Document document = null;
PdfCopy writer = null;
ByteArrayOutputStream archivoFinal = new ByteArrayOutputStream();
try {
r = new PdfReader(fuente);
raf = new RandomAccessFileOrArray(fuente);
document = new Document(r.getPageSizeWithRotation(1));
writer = new PdfCopy(document,archivoFinal);
document.open();
PdfImportedPage page = null;
for (int i = 1; i <= r.getNumberOfPages(); i++) {
PdfDictionary pageDict = r.getPageN(i);
PdfDictionary resDict = (PdfDictionary) pageDict.get(PdfName.RESOURCES);
boolean noFontsOrImages = true;
if (resDict != null) {
noFontsOrImages = resDict.get(PdfName.FONT) == null
&& resDict.get(PdfName.XOBJECT) == null;
}
if (!noFontsOrImages) {
byte bContent[] = r.getPageContent(i, raf);
ByteArrayOutputStream bs = new ByteArrayOutputStream();
bs.write(bContent);
System.out.println("bs size: " + bs.size());
if (bs.size() > BLANK_THRESHOLD) {
page = writer.getImportedPage(r, i);
writer.addPage(page);
}
}
}
System.out.println("Original: " + fuente.length+ " new: " + archivoFinal.toByteArray().length);
return archivoFinal.toByteArray();
} finally {
if (document != null) {
document.close();
}
if (writer != null) {
writer.close();
}
if (raf != null) {
raf.close();
}
if (r != null) {
r.close();
}
}
}
}
我的pdf被破坏后我无法打开它。
即使没有空格的pdf我也会得到不同的尺寸,它应该是相同的
Original: 95089 New: 88129
这是我的las系统输出。
我顺便使用itext 2.1.5和java 1.5。我无法升级。
答案 0 :(得分:0)
无论如何我找到了答案。以防有人需要旧版本的itext
public static void removeBlankPdfPages(PdfReader r) throws IOException{
PdfTextExtractor extractor = new PdfTextExtractor(r);
List<Integer> paginas = new ArrayList<Integer>();
for (int i = 1; i <= r.getNumberOfPages(); i++) {
PdfDictionary pageDict = r.getPageN(i);
PdfDictionary resDict = (PdfDictionary) pageDict.get(PdfName.RESOURCES);
boolean noFontsOrImages = true;
if (resDict != null) {
noFontsOrImages = resDict.get(PdfName.FONT) == null
&& resDict.get(PdfName.XOBJECT) == null;
}
if (!noFontsOrImages) {
String textFromPage = extractor.getTextFromPage(i);
if(textFromPage.length() >50 ){
paginas.add(i);
}
}
}
r.selectPages(paginas);
}