I have multiple separate PDFs and I want to read the text from them one by one and write it to a new PDF so that they all end up in the same document. I don't want to use PdfMerger because that will include all the white space. This is why I want to read and copy the actual text.
The main problem, though, is the font. The code below is fine for reading and writing all the text, but I lose the font size/type and whether or not it's in bold. I need this information to format the text on my destination page. Does anyone know how to do this?
Thanks.
// Builds a single PDF in memory that contains the plain text of every page of
// every clause document, one paragraph per source page, and returns its bytes.
byte[] result;
using (var ms = new MemoryStream())
{
    var writer = new PdfWriter(ms);
    PdfDocument outPdf = new PdfDocument(writer);
    Document outDocument = new Document(outPdf);

    foreach (var clause in clauses)
    {
        // Open the clause's PDF from its raw bytes.
        var sourceReader = new PdfReader(new MemoryStream(clause.ClauseBytes));
        var sourcePdf = new PdfDocument(sourceReader);
        try
        {
            int pageCount = sourcePdf.GetNumberOfPages();

            // GetPage is called with 1-based page numbers.
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
            {
                var sourcePage = sourcePdf.GetPage(pageNumber);

                // A fresh strategy per page so text from earlier pages is not carried over.
                // NOTE(review): this strategy yields plain text only, so font name, size
                // and weight are lost. Preserving them presumably needs a custom listener
                // that inspects each TextRenderInfo event - confirm against the iText docs.
                var strategy = new SimpleTextExtractionStrategy();
                var text = PdfTextExtractor.GetTextFromPage(sourcePage, strategy);

                // The extracted value is already a .NET string; re-encoding it through
                // Encoding.Default could only replace characters outside that code page.
                outDocument.Add(new Paragraph(text));
            }
        }
        finally
        {
            // Release the source document even if extraction throws.
            sourcePdf.Close();
        }
    }

    // Closing the layout document finishes writing the PDF into the stream.
    outDocument.Close();

    // ToArray copies only the bytes actually written; GetBuffer would also return the
    // unused tail of the internal buffer and append garbage after the end of the PDF.
    result = ms.ToArray();
}
return result;