Question

我正在使用 Apache poi 库使用以下java代码将 doc文件转换为pdf ：

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

import com.lowagie.text.Document;
import com.lowagie.text.Paragraph;
import com.lowagie.text.pdf.PdfWriter;

public class TestDoc {

    /**
     * @param args
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
        POIFSFileSystem fs = null;
        Document document = new Document();
        try {
            System.out.println("Starting the test");

            //D:\vijay\doctopdf
            fs = new POIFSFileSystem(new FileInputStream("D:/vijay/doctopdf/test.doc"));

            HWPFDocument doc = new HWPFDocument(fs);
            WordExtractor we = new WordExtractor(doc);

            OutputStream file = new FileOutputStream(new File("D:/vijay/doctopdf/test.pdf"));

            PdfWriter writer = PdfWriter.getInstance(document, file);

            Range range = doc.getRange();
            document.open();
            writer.setPageEmpty(true);
            document.newPage();
            writer.setPageEmpty(true);

            String[] paragraphs = we.getParagraphText();
            for (int i = 0; i < paragraphs.length; i++) {

                org.apache.poi.hwpf.usermodel.Paragraph pr = range
                        .getParagraph(i);
                // CharacterRun run = pr.getCharacterRun(i);
                // run.setBold(true);
                // run.setCapitalized(true);
                // run.setItalic(true);
                paragraphs[i] = paragraphs[i].replaceAll("\\cM?\r?\n", "");
                System.out.println("Length:" + paragraphs[i].length());
                System.out.println("Paragraph" + i + ": "
                        + paragraphs[i].toString());

                // add the paragraph to the document
                document.add(new Paragraph(paragraphs[i]));
            }

            System.out.println("Document testing completed");
        } catch (Exception e) {
            System.out.println("Exception during test");
            e.printStackTrace();
        } finally {
            // close the document
            document.close();
        }
    }

}

以上代码成功运行（仅转换pdf中的文本）。但是当doc包含表格或图像等等时，它将不会出现在pdf中。任何人都知道如何以完全准确和格式化的方式获得pdf文档。

Answer 1

您可以使用Apache Tika Parser的WordExtractor

Java使用Apache POI库将doc文件转换为pdf，包含所有图形，图像，表格，边框等

1 个答案: