如何在C#中使用iText在PDF中查找文本的坐标位置?

时间:2018-09-20 09:48:01

标签: c# itext

我想在PDF中找到包含给定模式的某一行(或段落)的坐标位置。

例如,我可能遇到这个问题:

  • 输入:一个正则表达式(例如 "Test.*")和一个PDF,该PDF中包含与该正则表达式匹配的一行(或一段)。
  • 期望的输出:与该正则表达式匹配的各行的Y坐标列表。

有人知道我如何检测到这些位置吗?

非常感谢您。

艾略特

1 个答案:

答案 0 :(得分:0)

我可以为您提供一些帮助,不过这段代码并不完整:我以前写过它,但没有写完。借助它,您可以确定文本的位置——程序会遍历PDF中的每个文本项,并输出其坐标。

我使用的是 iText 7 和 .NET Core。

// Example call: search every page of 1.pdf for the literal text "test".
string[] srcFileNames = { "1.pdf" };
FindTextInPdf("test", srcFileNames);

 /// <summary>
 /// Scans every existing PDF in <paramref name="sources"/> page by page, reports on the
 /// console each page whose extracted text contains <paramref name="SearchStr"/>, and
 /// dumps the chunks collected by <c>TextLocationStrategy</c> for that page.
 /// </summary>
 /// <param name="SearchStr">Literal text to look for in the extracted page text.</param>
 /// <param name="sources">Paths of the PDF files to scan; paths that do not exist are skipped.</param>
 /// <exception cref="ArgumentNullException">
 /// <paramref name="SearchStr"/> or <paramref name="sources"/> is null.
 /// </exception>
 public void FindTextInPdf(string SearchStr, string[] sources)
 {
     if (SearchStr == null)
     {
         throw new ArgumentNullException(nameof(SearchStr));
     }

     if (sources == null)
     {
         throw new ArgumentNullException(nameof(sources));
     }

     foreach (var item in sources)
     {
         // Missing files are skipped silently rather than treated as an error.
         if (!File.Exists(item))
         {
             continue;
         }

         using (PdfReader reader = new PdfReader(item))
         using (var doc = new PdfDocument(reader))
         {
             var pageCount = doc.GetNumberOfPages();

             // The loop starts at 1 and runs up to and including the page count.
             for (int i = 1; i <= pageCount; i++)
             {
                 PdfPage page = doc.GetPage(i);
                 var box = page.GetCropBox();
                 var rect = new Rectangle(box.GetX(), box.GetY(), box.GetWidth(), box.GetHeight());

                 // Restrict extraction to text that falls inside the page's crop box.
                 var filter = new IEventFilter[] { new TextRegionEventFilter(rect) };

                 // The strategy appends to a static list; reset it so the dump below
                 // shows only this page's chunks instead of everything seen so far.
                 TextLocationStrategy.objectResult.Clear();

                 ITextExtractionStrategy strategy = new FilteredTextEventListener(new TextLocationStrategy(), filter);
                 var str = PdfTextExtractor.GetTextFromPage(page, strategy);
                 if (str.Contains(SearchStr))
                 {
                     Console.WriteLine("Searched text found in file:[ " + item + " ] page : [ " + i + " ]");
                 }

                 foreach (var d in TextLocationStrategy.objectResult)
                 {
                     Console.WriteLine("Char >"+ d.Text+ " X >"+ d.Rect.GetX()+" font >"+ d.FontFamily + " font size >"+ d.FontSize.ToString()+" space >"+ d.SpaceWidth);
                 }
             }
         }
     }
 }


class TextLocationStrategy : LocationTextExtractionStrategy
{
    // Chunks recorded by EventOccurred, one per character render info.
    // The list is static, so it is shared by every instance of this strategy;
    // nothing in the visible part of this class ever clears it.
    public static List<TextMyChunk> objectResult = new List<TextMyChunk>();

    /// <summary>
    /// Plain data holder describing one piece of rendered text and where it was drawn.
    /// </summary>
    public class TextMyChunk
    {
        /// <summary>The text carried by this chunk.</summary>
        public string Text { get; set; }

        /// <summary>Rectangle locating the chunk on the page.</summary>
        public Rectangle Rect { get; set; }

        /// <summary>String describing the font the chunk was rendered with.</summary>
        public string FontFamily { get; set; }

        /// <summary>Font size reported for the chunk.</summary>
        public float FontSize { get; set; }

        /// <summary>Width of a single space reported for the chunk.</summary>
        public float SpaceWidth { get; set; }
    }

    /// <summary>
    /// Handles a parser event: forwards it to the base strategy so the page text is
    /// still assembled, then, for text-render events, records one
    /// <see cref="TextMyChunk"/> per character in <see cref="objectResult"/>.
    /// </summary>
    /// <param name="data">Event payload; cast to <c>TextRenderInfo</c> for RENDER_TEXT events.</param>
    /// <param name="type">Kind of event being delivered.</param>
    public override void EventOccurred(IEventData data, EventType type)
    {
        // Without this call the base strategy never sees any text, so
        // GetResultantText() has nothing to return and text searches cannot match.
        base.EventOccurred(data, type);

        if (!type.Equals(EventType.RENDER_TEXT))
        {
            return;
        }

        TextRenderInfo renderInfo = (TextRenderInfo)data;

        // Split the rendered run so that every character gets its own position.
        IList<TextRenderInfo> text = renderInfo.GetCharacterRenderInfos();
        foreach (TextRenderInfo t in text)
        {
            string letter = t.GetText();

            // The rectangle spans from the start of the baseline to the end of the ascent line.
            Vector letterStart = t.GetBaseline().GetStartPoint();
            Vector letterEnd = t.GetAscentLine().GetEndPoint();
            Rectangle letterRect = new Rectangle(letterStart.Get(0), letterStart.Get(1), letterEnd.Get(0) - letterStart.Get(0), letterEnd.Get(1) - letterStart.Get(1));

            TextMyChunk chunk = new TextMyChunk();
            chunk.Text = letter;
            chunk.Rect = letterRect;
            chunk.FontFamily = t.GetFont().GetFontProgram().ToString();
            chunk.FontSize = t.GetFontSize();
            chunk.SpaceWidth = t.GetSingleSpaceWidth();

            objectResult.Add(chunk);
        }
    }