How do I read font information when reading text from pdf using itext?

时间:2019-04-17 01:37:23

标签: c# pdf itext

I have multiple separate PDFs and I want to read the text from them one by one and write it to a new PDF so that it is all in the same document. I don't want to use PdfMerger because that will include all the white space. This is why I want to read and copy the actual text.

The main problem, though, is the font. The code below is fine for reading and writing all the text, but I lose the font size/type and whether or not it's in bold. I need this information to format the text on my destination page. Does anyone know how to do this?

Thanks.

 byte[] result;

        // Builds a single output PDF in memory by extracting the plain text of every
        // page of every clause PDF and appending it as one paragraph per source page.
        // Only text is carried over: SimpleTextExtractionStrategy yields a string, so
        // font family, size and weight from the source documents are not preserved.
        using (var ms = new MemoryStream())
        {
            var writer = new PdfWriter(ms);
            var outPdf = new PdfDocument(writer);
            var outDocument = new Document(outPdf);

            foreach (var clause in clauses)
            {
                var sourceReader = new PdfReader(new MemoryStream(clause.ClauseBytes));
                var sourcePdf = new PdfDocument(sourceReader);

                try
                {
                    // Evaluate the page count once; iText page numbers are 1-based.
                    int pageCount = sourcePdf.GetNumberOfPages();

                    for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
                    {
                        var sourcePage = sourcePdf.GetPage(pageNumber);

                        // A fresh strategy per page so text from earlier pages is not re-emitted.
                        var strategy = new SimpleTextExtractionStrategy();

                        // The extracted value is already a .NET (UTF-16) string. Round-tripping it
                        // through Encoding.Default would replace every character outside the
                        // ANSI code page with '?', so it is used as-is.
                        var text = PdfTextExtractor.GetTextFromPage(sourcePage, strategy);

                        outDocument.Add(new Paragraph(text));
                    }
                }
                finally
                {
                    // Release the source document (and its reader) even if extraction throws.
                    sourcePdf.Close();
                }
            }

            // Closing the layout document flushes the finished PDF into the stream.
            outDocument.Close();

            // ToArray copies exactly the bytes written and still works on a closed stream;
            // GetBuffer would return the whole backing buffer, including unused trailing capacity.
            result = ms.ToArray();
        }

        return result;

0 个答案:

没有答案