'问题是我无法阅读自己的pdf文件。'
我正在测试 PDFBox 这个 Java 工具库来编辑 pdf 文件。我制作了几个 pdf 文件,但在 PDFBox 看来它们似乎各不相同。我用 PDFBox 逐页打开文件,读取其中的文本(Tj 或 TJ 操作符),并在需要时替换它。我有一个模板,其中的文字是:"这个文件是由[用户名]创建的"。使用我的程序,我可以把 [用户名] 替换成我数据库中的用户。
现在我用谷歌文档制作了 pdf 文件(下载为 pdf),但在 PDFBox 中读出来的是空行,读取的代码如下:
// Fragment from the token loop: the token at index j is a Tj operator.
// The token immediately before it (index j - 1) is cast to the string operand.
COSString previous = (COSString) tokens.get(j - 1);
// Decode the operand into a Java String using COSString's own decoding.
String string = previous.getString();
我又用 LibreOffice(在 Ubuntu 上)创建了一个 pdf,这次文本使用的是 TJ 操作符,需要用下面这段代码来读取:
// Fragment from the token loop: the token at index j is a TJ operator.
// The token immediately before it (index j - 1) is cast to an array operand.
COSArray previous = (COSArray) tokens.get(j - 1);
// Walk the array and pick out only the string elements; other elements are skipped.
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
// Decode this piece of text using COSString's own decoding.
String string = cosString.getString();
}
}
这样读出来的是一些奇怪的内容。
当我使用一个 Windows 的 docx 文档(我在某封电子邮件里找到的)并用在线转换器转换它时,它同样包含 TJ 元素,而且工作得很好(我能看到文本)。
我的问题是:不同的 pdf 之间有什么区别(pdf 版本 1.3 / 1.4、字体、编码,还是其他?)。更重要的是,PDFBox 能理解哪些内容,或者我该如何读取其他文件?是否有相关的元数据?我可以在 PDFBox 中设置 encoding / version / font 类型吗?
感谢,
tibi
PS:下面是我的完整代码(仍在开发中,写得不太好):
package nl.tibi.pdfboxhelper;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDMetadata;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
import org.apache.pdfbox.util.PDFOperator;
import org.apache.pdfbox.util.PDFTextStripper;
/**
 * Replaces a literal piece of text inside the strings shown by the {@code Tj}
 * and {@code TJ} operators of every page of a PDF, then saves the result to a
 * new file.
 *
 * <p>Diagnostic information (extracted text, document information and a dump of
 * every string operand that is visited) is printed to {@code System.out}.
 *
 * <p>Replacement text is written back as ISO-8859-1 bytes.
 * NOTE(review): this presumably only renders correctly for fonts whose encoding
 * is compatible with those bytes; PDFs using subset or custom-encoded fonts
 * likely need the font's own encoding instead - confirm against the PDF
 * specification and the PDFBox font API.
 */
public class PdfStringReplacer {

    /** Charset used when turning replacement text back into string bytes. */
    private static final String LATIN_1 = "ISO-8859-1";

    /**
     * Locate a string in a PDF and replace it with a new string.
     *
     * <p>Within each string operand only the first occurrence of
     * {@code strToFind} is replaced. The search text is matched literally (it is
     * not a regular expression) and {@code message} is inserted verbatim.
     * String operands that do not contain {@code strToFind} keep their original
     * bytes.
     *
     * @param inputFile
     *            The PDF to open.
     * @param outputFile
     *            The PDF to write to.
     * @param strToFind
     *            The string to find in the PDF document.
     * @param message
     *            The message to write in the file.
     * @throws IOException
     *             If there is an error reading or writing the data.
     * @throws COSVisitorException
     *             If there is an error writing the PDF.
     * @throws NullPointerException
     *             If {@code strToFind} or {@code message} is {@code null}.
     */
    public void replaceString(String inputFile, String outputFile, String strToFind, String message) throws IOException, COSVisitorException {
        // Fail before touching any file rather than half-way through a page.
        if (strToFind == null) {
            throw new NullPointerException("strToFind");
        }
        if (message == null) {
            throw new NullPointerException("message");
        }

        PDDocument doc = null;
        try {
            doc = PDDocument.load(inputFile);
            printDocumentSummary(doc);

            // Raw List: the PDFBox calls used in this class return and accept raw lists.
            List pages = doc.getDocumentCatalog().getAllPages();
            for (int i = 0; i < pages.size(); i++) {
                replaceOnPage(doc, (PDPage) pages.get(i), strToFind, message);
            }
            doc.save(outputFile);
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }

    /** Prints the extracted text and the document information entries to standard output. */
    private static void printDocumentSummary(PDDocument doc) throws IOException {
        PDFTextStripper textStripper = new PDFTextStripper();
        System.out.println(textStripper.getText(doc));

        PDDocumentInformation info = doc.getDocumentInformation();
        System.out.println("Page Count=" + doc.getNumberOfPages());
        System.out.println("Title=" + info.getTitle());
        System.out.println("Author=" + info.getAuthor());
        System.out.println("Subject=" + info.getSubject());
        System.out.println("Keywords=" + info.getKeywords());
        System.out.println("Creator=" + info.getCreator());
        System.out.println("Producer=" + info.getProducer());
        System.out.println("Creation Date=" + info.getCreationDate());
        System.out.println("Modification Date=" + info.getModificationDate());
        System.out.println("Trapped=" + info.getTrapped());
        System.out.println("isNeedToBeUpdate=" + info.getMetadataKeys());
    }

    /** Rewrites the Tj/TJ string operands of one page and installs the updated content stream. */
    private static void replaceOnPage(PDDocument doc, PDPage page, String strToFind, String message) throws IOException {
        PDStream contents = page.getContents();
        if (contents == null) {
            // A page without a content stream has nothing to replace.
            return;
        }

        // The font is only used for the diagnostic output of TJ strings.
        PDFont font = new PDGraphicsState(page.findCropBox()).getTextState().getFont();
        if (font == null) {
            font = new PDType1Font();
        }

        PDFStreamParser parser = new PDFStreamParser(contents.getStream());
        parser.parse();
        List tokens = parser.getTokens();

        // Start at 1: an operator at index 0 has no preceding operand.
        for (int j = 1; j < tokens.size(); j++) {
            Object token = tokens.get(j);
            if (!(token instanceof PDFOperator)) {
                continue;
            }
            // Tj and TJ are the two operators handled here; the operand is
            // the token right before the operator.
            String operation = ((PDFOperator) token).getOperation();
            Object operand = tokens.get(j - 1);

            if ("Tj".equals(operation) && operand instanceof COSString) {
                COSString shown = (COSString) operand;
                String original = shown.getString();
                String replaced = replaceFirstLiteral(original, strToFind, message);
                // Dump the raw bytes BEFORE they are reset, otherwise the dump is empty.
                System.out.println("p: " + replaced + describeBytes(shown));
                overwriteIfChanged(shown, original, replaced);
            } else if ("TJ".equals(operation) && operand instanceof COSArray) {
                COSArray pieces = (COSArray) operand;
                for (int k = 0; k < pieces.size(); k++) {
                    Object element = pieces.getObject(k);
                    if (!(element instanceof COSString)) {
                        continue;
                    }
                    COSString piece = (COSString) element;
                    String original = piece.getString();
                    byte[] raw = piece.getBytes();
                    // Only ask the font to decode a first byte when there is one.
                    String firstCode = raw.length > 0 ? font.encode(raw, 0, 1) : null;
                    System.out.println(firstCode + " " + original + describeBytes(piece));
                    overwriteIfChanged(piece, original, replaceFirstLiteral(original, strToFind, message));
                }
            }
        }

        // now that the tokens are updated we will replace the
        // page content stream.
        PDStream updatedStream = new PDStream(doc);
        OutputStream out = updatedStream.createOutputStream();
        try {
            ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
            tokenWriter.writeTokens(tokens);
        } finally {
            // Closing releases the stream and flushes anything still buffered.
            out.close();
        }
        page.setContents(updatedStream);
    }

    /** Returns {@code text} with the first literal occurrence of {@code strToFind} replaced by {@code message}. */
    private static String replaceFirstLiteral(String text, String strToFind, String message) {
        int at = text.indexOf(strToFind);
        if (at < 0) {
            return text;
        }
        return text.substring(0, at) + message + text.substring(at + strToFind.length());
    }

    /** Replaces the bytes of {@code target} with {@code replaced}, but only when the text actually changed. */
    private static void overwriteIfChanged(COSString target, String original, String replaced) throws IOException {
        if (replaced.equals(original)) {
            // Leave untouched strings byte-identical instead of re-encoding them.
            return;
        }
        target.reset();
        target.append(replaced.getBytes(LATIN_1));
    }

    /** Formats the hex form of a string operand plus its bytes read as UTF-16BE and as ISO-8859-1. */
    private static String describeBytes(COSString value) throws IOException {
        byte[] raw = value.getBytes();
        return " <=> " + value.getHexString() + " <->" + new String(raw, "UTF-16BE")
                + " <->" + new String(raw, LATIN_1);
    }
}