使用java中的itext将pdf文件转换为word文档

时间:2013-12-12 13:55:57

标签: java pdf itext

感谢您的回复。文本可以提取出来,但没有保留任何格式(空格、字体大小、侧标题、正文)。我想按照PDF中的原样读取数据。非常感谢任何帮助。我的代码如下。

到目前为止,这是我的代码:

package bis.proj.samp;

import java.io.File;
import java.io.FileOutputStream;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;
import com.lowagie.text.Document;
import com.lowagie.text.Paragraph;
import com.lowagie.text.rtf.RtfWriter2;

/**
 * Copies the text of a PDF into a document written through {@code RtfWriter2}.
 *
 * <p>Each PDF page is extracted with {@code PdfTextExtractor.getTextFromPage},
 * added to the output as a single {@code Paragraph}, and followed by a page
 * break. Only the extracted text string is carried over; nothing in this class
 * transfers fonts, sizes, or heading structure.
 */
public class ReadPdfFile {

    /**
     * Runs the conversion using the hard-coded input and output paths.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Document document = new Document();

        // FileOutputStream creates the file when it is missing, so no separate
        // exists()/createNewFile() step is needed. try-with-resources makes sure
        // the stream is released even when the conversion fails half-way.
        try (FileOutputStream out = new FileOutputStream("/home/mujafar/Desktop/file.doc")) {
            RtfWriter2.getInstance(document, out);
            System.out.println("file created");
            document.open();

            PdfReader reader = new PdfReader("/home/mujafar/Desktop/NPTEL Transcription Guidelines.pdf");
            try {
                int n = reader.getNumberOfPages();
                System.out.println("total no of pages:::" + n);

                // iText page numbers are 1-based.
                for (int i = 1; i <= n; i++) {
                    String s = PdfTextExtractor.getTextFromPage(reader, i);

                    System.out.println("string:::" + s);
                    System.out.println("====================");

                    document.add(new Paragraph(s));
                    document.newPage();
                }
            } finally {
                // Release the PDF file handle whether or not extraction succeeded.
                reader.close();
            }

            // Closing the document flushes the content through the writer.
            document.close();
            System.out.println("completed");
        } catch (Exception de) {
            // Previously swallowed silently; report the failure so it can be diagnosed.
            System.err.println("PDF conversion failed: " + de);
            de.printStackTrace();
        }
    }

}

1 个答案:

答案 0 :(得分:0)

使用RenderListener可能更好,如下所示:

PdfReader reader = new PdfReader(inputStream);
// Accumulates every text chunk reported by the listener, across all pages.
final StringBuilder documentText = new StringBuilder();
// Receives a callback for each piece of content the parser encounters on a page.
RenderListener listener = new RenderListener()
{
    @Override
    public void renderText(TextRenderInfo arg0)
    {
        // Record the start point of the chunk's baseline (truncated to whole
        // units) together with its text, one chunk per line.
        LineSegment segment = arg0.getBaseline();
        int x = (int)segment.getStartPoint().get(Vector.I1);
        int y = (int)segment.getStartPoint().get(Vector.I2);
        documentText.append("at "+x+"/"+y+": "+arg0.getText());
        documentText.append("\n");
    }

    @Override
    public void renderImage(ImageRenderInfo arg0)
    {
        // TODO
    }

    @Override
    public void endTextBlock()
    {
        // Nothing to do at the end of a text block.
    }

    @Override
    public void beginTextBlock()
    {
        // Nothing to do at the start of a text block.
    }
};

PdfReaderContentParser p = new PdfReaderContentParser(reader);
// Page numbers are 1-based; feed every page through the same listener.
for (int i = 1; i <= reader.getNumberOfPages(); i++)
{
    p.processContent(i, listener);
    documentText.append("\n\n(page break)\n\n");
}

// documentText now contains the accumulated text from your PDF.
// You could use Apache Poi to construct a DOCX with it.
System.out.println(documentText.toString());

请注意,renderText 的调用顺序不一定与文本在屏幕/纸张上的显示顺序一致,您可能需要根据各个 arg0 对象中的 X/Y 坐标来“重建”页面布局。