Question

'问题是我无法读取自己的pdf文件。'

我正在测试pdfbox这个java工具库来编辑pdf文件。我制作了一些pdf文件，而在pdfbox看来它们似乎各不相同。因此我用pdfbox逐页打开文件，读取文本（如果操作符是Tj或TJ）并在需要时替换它。我有一个带文字的模板："这个文件是由[用户名]制作的"。使用我的程序，我可以用数据库中的用户替换[用户名]。

现在我从谷歌文档制作了pdf文件（下载为pdf），在pdfbox中用下面的代码读取时得到的是空行：

// For a Tj operator, the token right before it is the string operand to show.
String string = ((COSString) tokens.get(j - 1)).getString();

我使用libre office（在ubuntu上）创建了一个pdf，现在它是一个需要这段代码的TJ：

// For a TJ operator, the token right before it is an array operand;
// only its string entries carry text, so everything else is skipped.
COSArray operands = (COSArray) tokens.get(j - 1);
for (int k = 0; k < operands.size(); k++) {
    Object element = operands.getObject(k);
    if (!(element instanceof COSString)) {
        continue;
    }
    String string = ((COSString) element).getString();
}

这会给我一些奇怪的结果。

当我使用一个Windows的docx文档（我在某封电子邮件里找到的）并用在线转换器转换它时，它同样包含TJ元素，而且工作得很好（我能看到文本）。

我的问题是：不同的pdf之间有什么区别（pdf版本1.3 / 1.4，字体，编码，还是别的？）。更重要的是，pdfbox能理解哪些，或者我该如何读取其他文件？是否有相关的元数据？我可以在pdfbox中设置encoding / version / font类型吗？

感谢，

tibi

ps 这是我的完整代码（仍在开发中，写得不算好）：

package nl.tibi.pdfboxhelper;

import java.io.IOException;
import java.io.OutputStream;
import java.util.List;

import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Replaces the first occurrence of a literal search string inside each string
 * operand of the text-showing operators ({@code Tj} and {@code TJ}) on every
 * page of a PDF document, printing diagnostic information along the way.
 *
 * <p>NOTE(review): the replacement text is written back as ISO-8859-1 bytes.
 * That is presumably only correct for fonts whose character codes coincide
 * with a single-byte Latin encoding; documents using subset or composite
 * fonts will likely need font-aware encoding instead - verify per document.
 */
public class PdfStringReplacer {

    /** Charset used to turn replaced text back into string-operand bytes. */
    private static final String OPERAND_CHARSET = "ISO-8859-1";

    /**
     * Locate a string in a PDF and replace it with a new string.
     *
     * <p>The search string is matched literally (not as a regular expression),
     * and only the first occurrence inside each string operand is replaced.
     * String operands that do not contain the search string are left untouched.
     *
     * @param inputFile
     *            The PDF to open.
     * @param outputFile
     *            The PDF to write to.
     * @param strToFind
     *            The string to find in the PDF document.
     * @param message
     *            The message to write in the file.
     * @throws IOException
     *             If there is an error writing the data.
     * @throws COSVisitorException
     *             If there is an error writing the PDF.
     */
    public void replaceString(String inputFile, String outputFile, String strToFind, String message) throws IOException, COSVisitorException {
        PDDocument doc = null;
        try {
            doc = PDDocument.load(inputFile);
            printDocumentInfo(doc);

            // Raw List: this is the element type the PDFBox API hands back here.
            List pages = doc.getDocumentCatalog().getAllPages();
            for (int i = 0; i < pages.size(); i++) {
                PDPage page = (PDPage) pages.get(i);
                PDStream contents = page.getContents();
                if (contents == null) {
                    // Nothing to parse or rewrite on a page without a content stream.
                    continue;
                }

                // The font is only used for the diagnostic output of TJ operands.
                PDFont font = new PDGraphicsState(page.findCropBox()).getTextState().getFont();
                if (font == null) {
                    font = new PDType1Font();
                }

                PDFStreamParser parser = new PDFStreamParser(contents.getStream());
                parser.parse();
                List tokens = parser.getTokens();
                replaceInTokens(tokens, font, strToFind, message);

                // now that the tokens are updated we will replace the
                // page content stream.
                PDStream updatedStream = new PDStream(doc);
                OutputStream out = updatedStream.createOutputStream();
                try {
                    ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
                    tokenWriter.writeTokens(tokens);
                } finally {
                    // Close so that everything written is committed to the new stream.
                    out.close();
                }
                page.setContents(updatedStream);
            }
            doc.save(outputFile);
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }

    /**
     * Prints the extracted text and the document information entries to standard output.
     */
    private void printDocumentInfo(PDDocument doc) throws IOException {
        PDFTextStripper textStripper = new PDFTextStripper();
        System.out.println(textStripper.getText(doc));

        PDDocumentInformation info = doc.getDocumentInformation();
        System.out.println("Page Count=" + doc.getNumberOfPages());
        System.out.println("Title=" + info.getTitle());
        System.out.println("Author=" + info.getAuthor());
        System.out.println("Subject=" + info.getSubject());
        System.out.println("Keywords=" + info.getKeywords());
        System.out.println("Creator=" + info.getCreator());
        System.out.println("Producer=" + info.getProducer());
        System.out.println("Creation Date=" + info.getCreationDate());
        System.out.println("Modification Date=" + info.getModificationDate());
        System.out.println("Trapped=" + info.getTrapped());
        System.out.println("Metadata Keys=" + info.getMetadataKeys());
    }

    /**
     * Walks the parsed content-stream tokens and rewrites the string operands of
     * every Tj and TJ operator in place.
     */
    private void replaceInTokens(List tokens, PDFont font, String strToFind, String message) throws IOException {
        for (int j = 0; j < tokens.size(); j++) {
            Object next = tokens.get(j);
            if (!(next instanceof PDFOperator)) {
                continue;
            }
            String operation = ((PDFOperator) next).getOperation();
            // Tj and TJ are the two operators handled here; their operand is the
            // token directly before them. Guard against a stream that starts with
            // an operator or carries an operand of an unexpected type.
            Object operand = j > 0 ? tokens.get(j - 1) : null;
            if ("Tj".equals(operation)) {
                if (operand instanceof COSString) {
                    replaceInShowString((COSString) operand, strToFind, message);
                }
            } else if ("TJ".equals(operation)) {
                if (operand instanceof COSArray) {
                    replaceInShowArray((COSArray) operand, font, strToFind, message);
                }
            }
        }
    }

    /** Handles the single string operand of a Tj operator. */
    private void replaceInShowString(COSString operand, String strToFind, String message) throws IOException {
        String original = operand.getString();
        String replaced = replaceFirstLiteral(original, strToFind, message);
        // Dump the operand before it is modified; dumping after reset() would
        // always show an empty string.
        System.out.println("p: " + replaced + " <=> " + operand.getHexString() + " <->" + new String(operand.getBytes(), "UTF-16BE")
                + " <->" + new String(operand.getBytes(), "ISO-8859-1"));
        overwriteIfChanged(operand, original, replaced);
    }

    /** Handles every string entry of the array operand of a TJ operator. */
    private void replaceInShowArray(COSArray operand, PDFont font, String strToFind, String message) throws IOException {
        for (int k = 0; k < operand.size(); k++) {
            Object arrElement = operand.getObject(k);
            if (!(arrElement instanceof COSString)) {
                continue;
            }
            COSString cosString = (COSString) arrElement;
            String original = cosString.getString();
            byte[] bytes = cosString.getBytes();
            // encode() is asked for one byte, so skip it for empty strings.
            String c = bytes.length > 0 ? font.encode(bytes, 0, 1) : "";
            System.out.println(c + " " + original + " <=> " + cosString.getHexString() + " <->"
                    + new String(bytes, "UTF-16BE") + " <->" + new String(bytes, "ISO-8859-1"));
            overwriteIfChanged(cosString, original, replaceFirstLiteral(original, strToFind, message));
        }
    }

    /**
     * Rewrites the operand bytes only when the text actually changed, so that
     * strings without a match keep their original bytes.
     */
    private static void overwriteIfChanged(COSString operand, String original, String replaced) throws IOException {
        if (replaced.equals(original)) {
            return;
        }
        operand.reset();
        operand.append(replaced.getBytes(OPERAND_CHARSET));
    }

    /**
     * Replaces the first literal occurrence of {@code strToFind} in {@code text}
     * with {@code message}. Unlike {@link String#replaceFirst(String, String)},
     * neither argument is interpreted as a regular expression, so search strings
     * such as {@code "[username]"} and messages containing {@code $} or
     * {@code \} behave as plain text.
     */
    private static String replaceFirstLiteral(String text, String strToFind, String message) {
        int at = text.indexOf(strToFind);
        if (at < 0) {
            return text;
        }
        return text.substring(0, at) + message + text.substring(at + strToFind.length());
    }
}

pdfbox不同的pdf版本，编码，字体类型

0 个答案: