我正在使用 PdfReader 提取 PDF 文本。但我想区分真正的换行(新段落)和仅因行宽不足而在段落内部产生的换行。
问题是,即使换行仍属于同一段落,它也会添加 \n。
这是我已经尝试过的一些代码。
/// <summary>
/// Extracts the text of a single page from the PDF opened from <c>Path</c>.
/// </summary>
/// <param name="page">
/// Page number passed to <c>PdfTextExtractor.GetTextFromPage</c>
/// (iText page numbers start at 1). Defaults to the first page.
/// </param>
/// <returns>The text of the requested page, as produced by a <c>LocationTextExtractionStrategy</c>.</returns>
public string GetContent(int page = 1)
{
    using (var pdfReader = new PdfReader(Path))
    {
        ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();

        // Honor the requested page; the page number was previously hard-coded to 1,
        // so the parameter had no effect.
        //
        // The extracted value is already a .NET string, so it is returned as-is.
        // Round-tripping it through Encoding.Default is at best a no-op and, where the
        // default encoding is an ANSI code page, replaces every character that code page
        // cannot represent with '?'.
        return PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
    }
}