我需要把 PDF 中的文本替换成其他语言的文本。第一步,我尝试使用 itextpdf 和 pdfbox 的 API 在 PDF 文件中搜索并替换文本。
下面的代码片段使用 itextpdf API 在源 PDF 文件中搜索文本,并将“Hello”替换为“Hi”。但是生成的新 PDF 中没有任何文本被替换。
/**
 * Copies the PDF at {@code src} to {@code dest}, replacing every literal occurrence of
 * "Hello" with "Hi" in the content stream of each page.
 *
 * <p>Only pages whose {@code /Contents} entry is a single stream are rewritten; pages whose
 * contents are stored in any other form are copied unchanged.
 *
 * <p>NOTE(review): this is a raw byte-level substitution. It can only match when the text
 * appears in the content stream as the literal characters "Hello"; text that is split across
 * several string operands or written with a custom font encoding will presumably not match.
 *
 * @param src  path of the PDF to read
 * @param dest path of the PDF to write
 * @throws Exception if the source cannot be read or the destination cannot be written
 */
public void manipulatePdf(String src, String dest) throws Exception {
    // Use the method arguments; the previous version ignored them in favour of SRC/DEST.
    PdfDocument pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest));
    try {
        int noOfPages = pdfDoc.getNumberOfPages();
        // Page numbers are 1-based, so the upper bound must be inclusive to reach the last page.
        for (int i = 1; i <= noOfPages; i++) {
            PdfPage page = pdfDoc.getPage(i);
            PdfDictionary dict = page.getPdfObject();
            PdfObject object = dict.get(PdfName.Contents);
            if (object instanceof PdfStream) {
                PdfStream stream = (PdfStream) object;
                byte[] data = stream.getBytes();
                // ISO-8859-1 maps every byte to exactly one char and back, so all bytes that
                // are not part of a match are written out unchanged on any platform.
                String content = new String(data, java.nio.charset.StandardCharsets.ISO_8859_1);
                String replaced = content.replace("Hello", "Hi");
                stream.setData(replaced.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1));
            }
        }
    } finally {
        // Closing the document is what flushes the result to dest and releases the reader.
        pdfDoc.close();
    }
}
我还使用 Apache PDFBox 实现了相同的功能,但同样没有成功。以下是供参考的代码片段。
public static PDDocument replaceText(PDDocument document, String searchString, String replacement)
throws IOException {
for (PDPage page : document.getPages()) {
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof Operator) {
Operator op = (Operator) next;
// Tj and TJ are the two operators that display strings in a PDF
if (op.getName().equals("Tj")) {
// Tj takes one operator and that is the string to display
// so lets update that operator
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
//System.out.println(new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
string = string.replaceFirst(searchString, replacement);
previous.setValue(string.getBytes());
} else if (op.getName().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
//System.out.println("22::"+new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
string = StringUtils.replaceOnce(string, searchString, replacement);
cosString.setValue(string.getBytes());
}
}
}
}
}
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
// save content
page.setContents(updatedStream);
out.close();
}
非常感谢任何解决方案或建议。
答案 0 :(得分:0)
下面是一个使用 PDFBox 的可运行版本:
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
/**
 * Small command-line utility that replaces text in the page content streams of a PDF using
 * PDFBox.
 *
 * <p>NOTE(review): matching works on the strings found in {@code Tj}/{@code TJ} operands.
 * Text written with a font-specific encoding will presumably not match.
 */
public final class PDFEditor {

    /**
     * Charset used to turn edited text back into PDF string bytes. A fixed single-byte charset
     * keeps the output independent of the platform default.
     */
    private static final java.nio.charset.Charset STRING_CHARSET =
            java.nio.charset.StandardCharsets.ISO_8859_1;

    private PDFEditor() {
    }

    /**
     * Loads the PDF at "src path", replaces "Hello" with "Hi" and saves it to "target Path".
     *
     * @param args unused
     * @throws IOException if the document cannot be loaded, edited or saved
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the document even when editing or saving fails.
        try (PDDocument document = PDDocument.load(new File("src path"))) {
            replaceText(document, "Hello", "Hi").save("target Path");
        }
    }

    /**
     * Replaces {@code searchString} with {@code replacement} on every page of {@code document}.
     *
     * <p>For {@code Tj} the first literal occurrence inside the string operand is replaced. For
     * {@code TJ} the array is replaced as a whole, and only when the concatenation of all its
     * string pieces (trimmed) equals {@code searchString}.
     *
     * @param document     the document to edit in place
     * @param searchString the literal text to look for; nothing is changed when empty
     * @param replacement  the text to put in its place; nothing is changed when empty
     * @return the same {@code document} instance
     * @throws IOException if a content stream cannot be parsed or written
     */
    private static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
        if (StringUtils.isEmpty(searchString) || StringUtils.isEmpty(replacement)) {
            return document;
        }
        for (PDPage page : document.getPages()) {
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<?> tokens = parser.getTokens();
            // Start at 1: an operator at index 0 has no preceding operand to edit.
            for (int j = 1; j < tokens.size(); j++) {
                Object next = tokens.get(j);
                if (!(next instanceof Operator)) {
                    continue;
                }
                String operatorName = ((Operator) next).getName();
                Object operand = tokens.get(j - 1);
                if ("Tj".equals(operatorName) && operand instanceof COSString) {
                    COSString shown = (COSString) operand;
                    // Literal (non-regex) replacement: searchString is plain text.
                    String edited = StringUtils.replaceOnce(shown.getString(), searchString, replacement);
                    shown.setValue(edited.getBytes(STRING_CHARSET));
                } else if ("TJ".equals(operatorName) && operand instanceof COSArray) {
                    replaceWholeArray((COSArray) operand, searchString, replacement);
                }
            }
            PDStream updatedStream = new PDStream(document);
            // Close the stream (even on failure) before attaching it to the page.
            try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
                ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
                tokenWriter.writeTokens(tokens);
            }
            page.setContents(updatedStream);
        }
        return document;
    }

    /**
     * Collapses a {@code TJ} operand array to a single string holding {@code replacement} when
     * the text it shows (all string pieces joined, then trimmed) equals {@code searchString}.
     */
    private static void replaceWholeArray(COSArray pieces, String searchString, String replacement) {
        StringBuilder shownText = new StringBuilder();
        int firstStringIndex = -1;
        for (int k = 0; k < pieces.size(); k++) {
            Object piece = pieces.getObject(k);
            if (piece instanceof COSString) {
                if (firstStringIndex < 0) {
                    firstStringIndex = k;
                }
                shownText.append(((COSString) piece).getString());
            }
        }
        if (firstStringIndex < 0 || !searchString.equals(shownText.toString().trim())) {
            return;
        }
        // Reuse the first string piece; element 0 itself may be a numeric adjustment, so it
        // must not be cast blindly.
        COSString kept = (COSString) pieces.getObject(firstStringIndex);
        kept.setValue(replacement.getBytes(STRING_CHARSET));
        // Remove every other element, back to front so pending indices stay valid.
        for (int k = pieces.size() - 1; k >= 0; k--) {
            if (k != firstStringIndex) {
                pieces.remove(k);
            }
        }
    }
}
依赖项:
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.0.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.11</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.0</version>
</dependency>
答案 1 :(得分:-2)
请使用下面这段可以正常工作的代码:
`
package com.pdf;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Date;
/**
 * Example that copies a PDF with iText while replacing one piece of text in the content
 * stream of every page.
 *
 * <p>NOTE(review): this is a raw substitution on the decoded content stream. It can only
 * match text that appears there as the literal characters of {@link #SEARCH_TEXT}.
 */
public class ItextTestMain {
    public static final String SRC = "C:\\JavaApp\\Pdf\\input\\inputFile.pdf";
    public static final String DEST = "C:\\JavaApp\\Pdf\\output\\outPutFile"+new Date().getTime() +".pdf";

    /** Text to look for in each page's content stream. */
    private static final String SEARCH_TEXT = "old_text";

    /** Text written in place of {@link #SEARCH_TEXT}; the two must differ for any change to occur. */
    private static final String REPLACEMENT_TEXT = "new_text";

    /**
     * Ensures the output directory exists, then processes {@link #SRC} into {@link #DEST}.
     *
     * @param args unused
     * @throws IOException       if reading or writing fails
     * @throws DocumentException if iText cannot produce the output document
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        File parent = file.getParentFile();
        if (parent != null) {
            parent.mkdirs();
        }
        processPDF(SRC, DEST);
    }

    /**
     * Copies the PDF at {@code src} to {@code dest}, replacing every literal occurrence of
     * {@link #SEARCH_TEXT} with {@link #REPLACEMENT_TEXT} on each page whose contents entry is
     * a single stream. Pages whose contents are stored in any other form are copied unchanged.
     *
     * @param src  path of the PDF to read
     * @param dest path of the PDF to write
     * @throws IOException       if reading or writing fails
     * @throws DocumentException if iText cannot produce the output document
     */
    public static void processPDF(String src, String dest) throws IOException, DocumentException {
        PdfReader reader = new PdfReader(src);
        try {
            int pageCount = reader.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                PdfDictionary dict = reader.getPageN(i);
                PdfObject object = dict.getDirectObject(PdfName.CONTENTS);
                if (object instanceof PRStream) {
                    PRStream stream = (PRStream) object;
                    byte[] data = PdfReader.getStreamBytes(stream);
                    // ISO-8859-1 maps every byte to exactly one char and back, so bytes outside
                    // a match are preserved regardless of the platform default charset.
                    String content = new String(data, java.nio.charset.StandardCharsets.ISO_8859_1);
                    String replaced = content.replace(SEARCH_TEXT, REPLACEMENT_TEXT);
                    stream.setData(replaced.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1));
                }
            }
            // The stamper writes the modified reader contents to dest when it is closed.
            try (FileOutputStream out = new FileOutputStream(dest)) {
                PdfStamper stamper = new PdfStamper(reader, out);
                stamper.close();
            }
        } finally {
            reader.close();
        }
    }
}
------------------------------------------------
Maven dependency to add to pom.xml:
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.0.6</version>
</dependency>`