我正在尝试使用 PDFBox 从 PDF 文件中提取所有图像。我使用的是 pdfbox 1.8.7 的 jar 文件,下面是我的代码:
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
/**
 * Extracts every image referenced by the pages of a PDF and writes each one to disk
 * using the PDFBox 1.8.x API.
 *
 * <p>The document is always closed once extraction finishes, and a failure to open the
 * PDF is reported to the caller instead of being swallowed and followed by a
 * {@link NullPointerException}.
 */
public class PdfBoxExtractor {

    /** PDF to read. On Linux use a path such as "/home/biadmin/freddy/15802643.pdf". */
    private static final String SOURCE_PDF = "C:\\Testpdf\\pdfsample\\15802643.pdf";

    /**
     * Prefix of every output file; a running image number is appended to it.
     * On Linux use a prefix such as "/home/biadmin/processed/image".
     */
    private static final String IMAGE_PREFIX = "C:\\processed\\pdf\\image";

    /**
     * Runs the extraction with the built-in paths and prints any I/O failure.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        PdfBoxExtractor obj = new PdfBoxExtractor();
        try {
            obj.read_pdf();
        } catch (IOException ex) {
            System.out.println("" + ex);
        }
    }

    /**
     * Extracts all images from {@link #SOURCE_PDF} into files named
     * {@link #IMAGE_PREFIX} followed by 1, 2, 3, ...
     *
     * @throws IOException if the PDF cannot be opened or an image cannot be written
     */
    void read_pdf() throws IOException {
        extractImages(SOURCE_PDF, IMAGE_PREFIX);
    }

    /**
     * Writes every image found in the page resources of {@code pdfPath} to
     * {@code outputPrefix + n}, where {@code n} counts up from 1 across the whole document.
     *
     * @param pdfPath      path of the PDF to read
     * @param outputPrefix path prefix handed to {@code write2file}
     *                     (presumably PDFBox appends the image file extension - confirm)
     * @throws IOException if loading, decoding, or writing fails
     */
    private void extractImages(String pdfPath, String outputPrefix) throws IOException {
        // Let a load failure propagate: continuing with a null document can only end in an NPE.
        PDDocument document = PDDocument.load(pdfPath);
        try {
            List<?> pages = document.getDocumentCatalog().getAllPages();
            int imageNumber = 1;
            for (Object entry : pages) {
                PDPage page = (PDPage) entry;
                PDResources resources = page.getResources();
                if (resources == null) {
                    // A page without a resource dictionary has no images to extract.
                    continue;
                }
                Map<String, PDXObjectImage> pageImages = resources.getImages();
                if (pageImages == null) {
                    continue;
                }
                for (PDXObjectImage image : pageImages.values()) {
                    image.write2file(outputPrefix + imageNumber);
                    imageNumber++;
                }
            }
        } finally {
            // Closing here (and not earlier) keeps the document referenced until every image
            // has been written, and avoids the "You did not close a PDF Document" warning.
            document.close();
        }
    }
}
我把同样的代码移植到 Red Hat 6.5 环境,修改了代码中的源路径和目标路径后尝试运行,但它抛出了很多错误。我完全不明白为什么同样的代码在 Linux 上会失败。两个环境中的 JRE 都是 Oracle JRE 1.7。以下是我在 Linux 上遇到的错误:
Dec 7, 2014 4:52:44 AM org.apache.pdfbox.cos.COSDocument finalize
WARNING: Warning: You did not close a PDF Document
Dec 7, 2014 4:52:44 AM org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap getRGBImage
SEVERE: java.lang.IndexOutOfBoundsException: Index: 2, Size: 0
Throwable occurred: java.lang.IndexOutOfBoundsException: Index: 2, Size: 0
at java.util.ArrayList.get(ArrayList.java:352)
at org.apache.pdfbox.io.RandomAccessBuffer.seek(RandomAccessBuffer.java:110)
at org.apache.pdfbox.io.RandomAccessFileOutputStream.write(RandomAccessFileOutputStream.java:116)
at org.apache.pdfbox.filter.RunLengthDecodeFilter.decode(RunLengthDecodeFilter.java:92)
at org.apache.pdfbox.cos.COSStream.doDecode(COSStream.java:318)
at org.apache.pdfbox.cos.COSStream.doDecode(COSStream.java:266)
at org.apache.pdfbox.cos.COSStream.getUnfilteredStream(COSStream.java:192)
at org.apache.pdfbox.pdmodel.common.PDStream.createInputStream(PDStream.java:232)
at org.apache.pdfbox.pdmodel.common.PDStream.getByteArray(PDStream.java:510)
at org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap.getRGBImage(PDPixelMap.java:255)
at org.apache.pdfbox.pdmodel.graphics.xobject.PDPixelMap.write2OutputStream(PDPixelMap.java:386)
at org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage.write2file(PDXObjectImage.java:262)
at com.vinayak.PdfBoxExtractor3.read_pdf(PdfBoxExtractor3.java:45)
at com.vinayak.PdfBoxExtractor3.main(PdfBoxExtractor3.java:16)
Exception in thread "main" java.lang.NullPointerException
at com.vinayak.PdfBoxExtractor3.read_pdf(PdfBoxExtractor3.java:39)
at com.vinayak.PdfBoxExtractor3.main(PdfBoxExtractor3.java:16)
谢谢。