我需要从PDF文件中提取文本和图像,并使用Java库iText 5在提取出的文本中标记(映射或引用)各个图像对应的位置。如果iText 5不适合完成这项任务,请推荐另一个具有相同功能的Java库。
到目前为止,我已经写了如下代码:
import java.io.IOException;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.itextpdf.text.Paragraph;
/**
 * Command-line entry point that extracts the text of every page of a PDF and
 * dumps the images it contains to disk using iText 5's content parser.
 */
public class Iconverter {

    /** Page count of the PDF most recently read by {@link #extractText(String)}. */
    static int PAGE_NUMBER;

    /**
     * Output path template for extracted images. The first {@code %s} receives
     * an image identifier and the second the file extension.
     */
    public static final String RESULT = "/home/sarah/Java for Dummies 4th Edition/Img%s.%s";

    /**
     * Extracts the text and the images of the hard-coded PDF.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String pdfName = "/home/sarah/Java for Dummies 4th Edition.pdf";
        try {
            String docText = extractText(pdfName);
            System.out.println("Extracted " + docText.length()
                    + " characters of text from " + PAGE_NUMBER + " pages");
            extractImages(pdfName);
        } catch (Exception e) {
            // Top-level boundary: report the failure and exit normally.
            e.printStackTrace();
        }
    }

    /**
     * Reads a PDF and returns the text of all its pages, in page order, with a
     * newline appended after each page. Also records the page count in
     * {@link #PAGE_NUMBER}.
     *
     * @param filename path of the source PDF
     * @return the concatenated text of every page
     * @throws IOException if the file cannot be read or parsed
     */
    public static String extractText(String filename) throws IOException {
        PdfReader reader = new PdfReader(filename);
        try {
            PAGE_NUMBER = reader.getNumberOfPages();
            StringBuilder text = new StringBuilder();
            // iText page numbers are 1-based.
            for (int page = 1; page <= PAGE_NUMBER; page++) {
                // Append rather than overwrite so earlier pages are not lost.
                text.append(PdfTextExtractor.getTextFromPage(reader, page)).append('\n');
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /**
     * Parses a PDF and hands every image on every page to a
     * {@code MyImageRenderListener} configured with {@link #RESULT}.
     *
     * @param filename path of the source PDF
     * @throws IOException if the file cannot be read or parsed
     * @throws DocumentException declared for compatibility with existing callers
     */
    public static void extractImages(String filename) throws IOException, DocumentException {
        PdfReader reader = new PdfReader(filename);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            MyImageRenderListener listener = new MyImageRenderListener(RESULT);
            // Use this reader's own page count so the method also works when
            // it is called without main() having run first.
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                parser.processContent(page, listener);
            }
        } finally {
            reader.close();
        }
    }
}
import java.awt.image.BufferedImage;
import java.io.FileOutputStream;
import java.io.IOException;
import javax.imageio.ImageIO;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfImageObject;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
/**
 * A {@link RenderListener} that ignores text and writes every image it is
 * given to a file whose name is built from a path template.
 */
public class MyImageRenderListener implements RenderListener {

    /**
     * Output path template passed to {@link String#format}: the first
     * {@code %s} receives the image identifier, the second the file extension.
     */
    protected String path = "";

    /** Number of images seen so far that carried no indirect reference. */
    private int unreferencedImageCount = 0;

    /**
     * Creates a RenderListener that will look for images.
     *
     * @param path output path template with two {@code %s} placeholders
     */
    public MyImageRenderListener(String path) {
        this.path = path;
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
     */
    public void beginTextBlock() {
        // Text is not handled by this listener.
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
     */
    public void endTextBlock() {
        // Text is not handled by this listener.
    }

    /**
     * Saves the image to disk. DCT (JPEG) and JPX (JPEG 2000) streams are
     * written as-is, JBIG2 images are skipped, and anything else is decoded
     * to a {@link BufferedImage} and written as PNG. I/O failures are printed
     * and otherwise ignored so that one bad image does not abort the parse.
     *
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(
     *     com.itextpdf.text.pdf.parser.ImageRenderInfo)
     */
    public void renderImage(ImageRenderInfo renderInfo) {
        try {
            PdfImageObject image = renderInfo.getImage();
            // The /Filter entry may be a single name or an array of names, so
            // it is not cast to PdfName; equals() simply fails to match a
            // non-name value and the image falls through to the PNG branch.
            Object filter = image.get(PdfName.FILTER);

            if (PdfName.JBIG2DECODE.equals(filter)) {
                // ignore: filter not supported.
                return;
            }

            if (PdfName.DCTDECODE.equals(filter)) {
                writeBytes(String.format(path, imageId(renderInfo), "JPG"), image.getStreamBytes());
            } else if (PdfName.JPXDECODE.equals(filter)) {
                writeBytes(String.format(path, imageId(renderInfo), "jp2"), image.getStreamBytes());
            } else {
                BufferedImage awtimage = image.getBufferedImage();
                if (awtimage != null) {
                    String filename = String.format(path, imageId(renderInfo), "png");
                    FileOutputStream os = new FileOutputStream(filename);
                    try {
                        ImageIO.write(awtimage, "png", os);
                    } finally {
                        os.close();
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(
     *     com.itextpdf.text.pdf.parser.TextRenderInfo)
     */
    public void renderText(TextRenderInfo renderInfo) {
        // Text is not handled by this listener.
    }

    /**
     * Returns the identifier used in the output file name: the object number
     * of the image's reference, or a running "inline-N" label when the image
     * has no reference.
     */
    private String imageId(ImageRenderInfo renderInfo) {
        if (renderInfo.getRef() != null) {
            return String.valueOf(renderInfo.getRef().getNumber());
        }
        unreferencedImageCount++;
        return "inline-" + unreferencedImageCount;
    }

    /** Writes the bytes to the named file, closing the stream even on failure. */
    private static void writeBytes(String filename, byte[] data) throws IOException {
        FileOutputStream os = new FileOutputStream(filename);
        try {
            os.write(data);
            os.flush();
        } finally {
            os.close();
        }
    }
}