使用JAVA搜索和替换PDF中的文本

时间:2018-08-26 15:32:08

标签: java itext pdfbox

需要把PDF中的文本替换成其他语言的文本。第一步,我尝试使用 itextpdf 和 pdfbox 的 API 在PDF文件中搜索并替换文本。

下面的代码片段使用 itextpdf API 在源PDF文件中搜索文本,并将“Hello”替换为“Hi”。但是生成的新PDF中没有任何文本被替换。

/**
 * Copies the PDF at {@code src} to {@code dest}, replacing every literal
 * occurrence of "Hello" with "Hi" in the content streams of each page.
 *
 * <p>The replacement is done on the raw content-stream bytes, so it only
 * matches text that appears in the stream exactly as the ASCII bytes of
 * "Hello". NOTE(review): text stored in a font-specific encoding, or split
 * across several show-text operands, presumably will not match — confirm
 * against the actual input files.
 *
 * @param src  path of the PDF to read
 * @param dest path of the PDF to write
 * @throws Exception if the source cannot be read or the result cannot be written
 */
public void manipulatePdf(String src, String dest) throws Exception {
    // Use the method arguments; the earlier version referenced SRC/DEST, which
    // are not defined in this snippet and ignored what the caller passed in.
    try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest))) {
        int noOfPages = pdfDoc.getNumberOfPages();
        // Page numbers start at 1 and the last page is noOfPages, so the bound
        // must be inclusive; "<" skipped the last page (and the only page of a
        // single-page document).
        for (int i = 1; i <= noOfPages; i++) {
            PdfPage page = pdfDoc.getPage(i);
            // A page may have one content stream or several; visiting them by
            // index covers both cases.
            int streamCount = page.getContentStreamCount();
            for (int s = 0; s < streamCount; s++) {
                PdfStream stream = page.getContentStream(s);
                byte[] data = stream.getBytes();
                // ISO-8859-1 maps every byte to exactly one char and back, so
                // bytes that are not part of a match survive the round trip.
                String content = new String(data, java.nio.charset.StandardCharsets.ISO_8859_1);
                String replaced = content.replace("Hello", "Hi");
                if (!replaced.equals(content)) {
                    stream.setData(replaced.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1));
                }
            }
        }
    }
}

我还用 Apache PDFBox 实现了相同的功能,但同样没有成功。以下是供参考的代码段。

    public static PDDocument replaceText(PDDocument document, String searchString, String replacement)
        throws IOException {        
    for (PDPage page : document.getPages()) {
        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List tokens = parser.getTokens();
        for (int j = 0; j < tokens.size(); j++) {
            Object next = tokens.get(j);
            if (next instanceof Operator) {
                Operator op = (Operator) next;
                // Tj and TJ are the two operators that display strings in a PDF
                if (op.getName().equals("Tj")) {
                    // Tj takes one operator and that is the string to display
                    // so lets update that operator
                    COSString previous = (COSString) tokens.get(j - 1);
                    String string = previous.getString();
                    //System.out.println(new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
                    string = string.replaceFirst(searchString, replacement);
                    previous.setValue(string.getBytes());

                } else if (op.getName().equals("TJ")) {
                    COSArray previous = (COSArray) tokens.get(j - 1);
                    for (int k = 0; k < previous.size(); k++) {
                        Object arrElement = previous.getObject(k);
                        if (arrElement instanceof COSString) {
                            COSString cosString = (COSString) arrElement;
                            String string = cosString.getString();
                            //System.out.println("22::"+new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
                            string = StringUtils.replaceOnce(string, searchString, replacement);
                            cosString.setValue(string.getBytes());
                        }
                    }
                }
            }
        }

        PDStream updatedStream = new PDStream(document);
        OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
        ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
        tokenWriter.writeTokens(tokens);
         // save content
        page.setContents(updatedStream);
        out.close();
    }

非常感谢任何解决方案或建议。

2 个答案:

答案 0 :(得分:0)

下面是一个使用 PDFBox 的可用版本:

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

/**
 * Command-line style utility that replaces text shown by the {@code Tj} and
 * {@code TJ} operators in the page content of a PDF, using PDFBox.
 *
 * <p>Matching is literal. For {@code Tj} the first occurrence inside the
 * string operand is replaced. For {@code TJ} the string elements of the array
 * are concatenated and the whole array is replaced only when that
 * concatenation, trimmed, equals the search text exactly.
 */
public final class PDFEditor {

    private PDFEditor() {
    }

    /**
     * Loads the PDF at "src path", replaces "Hello" with "Hi" and saves the
     * result to "target Path".
     *
     * @param args unused
     * @throws IOException if the document cannot be loaded, rewritten or saved
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the document even if replacing or saving fails.
        try (PDDocument document = PDDocument.load(new File("src path"))) {
            replaceText(document, "Hello", "Hi");
            document.save("target Path");
        }
    }

    /**
     * Replaces {@code searchString} with {@code replacement} in the text
     * operands of every page of {@code document}.
     *
     * @param document     the document to modify in place
     * @param searchString the literal text to look for; if empty or null nothing is changed
     * @param replacement  the text to put in its place; if empty or null nothing is changed
     * @return the same {@code document} instance that was passed in
     * @throws IOException if a page's content cannot be parsed or rewritten
     */
    private static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
        if (StringUtils.isEmpty(searchString) || StringUtils.isEmpty(replacement)) {
            return document;
        }

        for (PDPage page : document.getPages()) {
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<?> tokens = parser.getTokens();

            // Start at 1: an operator at index 0 has no preceding operand to edit.
            for (int j = 1; j < tokens.size(); j++) {
                Object next = tokens.get(j);
                if (!(next instanceof Operator)) {
                    continue;
                }
                String opName = ((Operator) next).getName();
                Object operand = tokens.get(j - 1);

                // Check the operand type before casting so unexpected content
                // is skipped instead of aborting the whole document.
                if ("Tj".equals(opName) && operand instanceof COSString) {
                    replaceInString((COSString) operand, searchString, replacement);
                } else if ("TJ".equals(opName) && operand instanceof COSArray) {
                    replaceInArray((COSArray) operand, searchString, replacement);
                }
            }

            PDStream updatedStream = new PDStream(document);
            // Close the stream (even on failure) before attaching it to the page.
            try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
                ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
                tokenWriter.writeTokens(tokens);
            }
            page.setContents(updatedStream);
        }

        return document;
    }

    /** Replaces the first literal occurrence of the search text inside a single {@code Tj} operand. */
    private static void replaceInString(COSString target, String searchString, String replacement) {
        String current = target.getString();
        // replaceOnce is a literal replacement; String.replaceFirst would
        // interpret the search text as a regex and '$' or '\' in the
        // replacement as regex template syntax.
        String updated = StringUtils.replaceOnce(current, searchString, replacement);
        if (!updated.equals(current)) {
            // Let COSString choose the byte encoding for the new text instead
            // of relying on the platform default charset.
            target.setValue(new COSString(updated).getBytes());
        }
    }

    /**
     * Replaces the content of a {@code TJ} array with the replacement text when
     * the array's string elements, concatenated and trimmed, equal the search text.
     */
    private static void replaceInArray(COSArray array, String searchString, String replacement) {
        StringBuilder shownText = new StringBuilder();
        for (int k = 0; k < array.size(); k++) {
            Object element = array.getObject(k);
            if (element instanceof COSString) {
                shownText.append(((COSString) element).getString());
            }
        }

        if (!searchString.equals(shownText.toString().trim())) {
            return;
        }

        // Collapse the array to one string element. Rebuilding it avoids
        // assuming that element 0 is a string (a TJ array may begin with a
        // number, which previously caused a ClassCastException).
        array.clear();
        array.add(new COSString(replacement));
    }

}

依赖项:

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.0.6</version>
</dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.11</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.0</version>
</dependency>

答案 1 :(得分:-2)

请使用下面这段可以正常工作的代码(基于 iText 5):

`

package com.pdf;

import com.itextpdf.text.DocumentException;
import com.itextpdf.text.pdf.PRStream;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfObject;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.PdfStamper;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Date;

/**
 * Example program that copies a PDF while replacing one literal piece of text
 * in the content stream of each page, using iText 5.
 *
 * <p>Only pages whose /Contents entry is a single stream are processed; pages
 * whose content is anything else are copied unchanged.
 */
public class ItextTestMain {

    /** Path of the PDF to read. */
    public static final String SRC = "C:\\JavaApp\\Pdf\\input\\inputFile.pdf";
    /** Path of the PDF to write; the current time in milliseconds keeps each run's file name distinct. */
    public static final String DEST = "C:\\JavaApp\\Pdf\\output\\outPutFile"+new Date().getTime() +".pdf";

    /** Literal text to look for in each page's content stream. */
    private static final String SEARCH_TEXT = "old_text";
    /** Text written in place of {@link #SEARCH_TEXT}. */
    private static final String REPLACEMENT_TEXT = "new_text";

    /**
     * Creates the output directory if needed and processes {@link #SRC} into {@link #DEST}.
     *
     * @param args unused
     * @throws IOException       if reading or writing a file fails
     * @throws DocumentException if iText cannot produce the output document
     */
    public static void main(String[] args) throws IOException, DocumentException {
        File file = new File(DEST);
        file.getParentFile().mkdirs();
        processPDF(SRC, DEST);
    }

    /**
     * Reads the PDF at {@code src}, replaces every literal occurrence of
     * {@link #SEARCH_TEXT} with {@link #REPLACEMENT_TEXT} in each page's
     * content stream, and writes the result to {@code dest}.
     *
     * @param src  path of the PDF to read
     * @param dest path of the PDF to write
     * @throws IOException       if reading or writing a file fails
     * @throws DocumentException if iText cannot produce the output document
     */
    public static void processPDF(String src, String dest) throws IOException, DocumentException {
        PdfReader reader = new PdfReader(src);
        try {
            int pageCount = reader.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
                PdfDictionary dict = reader.getPageN(i);
                PdfObject object = dict.getDirectObject(PdfName.CONTENTS);
                if (object instanceof PRStream) {
                    PRStream stream = (PRStream) object;
                    byte[] data = PdfReader.getStreamBytes(stream);
                    // ISO-8859-1 maps every byte to exactly one char and back,
                    // so bytes that are not part of a match survive unchanged
                    // regardless of the platform default charset.
                    String content = new String(data, StandardCharsets.ISO_8859_1);
                    // Literal replace: the previous replaceAll treated the
                    // search text as a regex and replaced it with itself,
                    // which left the document unchanged.
                    String replaced = content.replace(SEARCH_TEXT, REPLACEMENT_TEXT);
                    if (!replaced.equals(content)) {
                        stream.setData(replaced.getBytes(StandardCharsets.ISO_8859_1));
                    }
                }
            }

            // Ensure the output file handle is released even if the stamper
            // cannot be created or closed.
            try (FileOutputStream out = new FileOutputStream(dest)) {
                PdfStamper stamper = new PdfStamper(reader, out);
                stamper.close();
            }
        } finally {
            reader.close();
        }
    }

}

------------------------------------------------

Maven dependencies to add  POM.xml

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.0.6</version>
</dependency>`