如何使用itextsharp获取pdf的每个单词的精确坐标?

时间:2017-11-07 16:59:59

标签: c# itext

我已经实现了自己的LocationTextExtractionStrategy。某些pdfs中的renderinfo将单词的组合读取为块,而在其他pdf中则逐字符读取

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text.pdf.parser;

namespace PDFAnnotater
{
    public class TestLTES : LocationTextExtractionStrategy
    {
        int WordIndex;
        //Hold each coordinate
        public List<RectAndText> myPoints = new List<RectAndText>();
        public string formedText="";
        public Vector[] charBottomLeft;
        public Vector[] charTopRight;
        public int Counter=0;
        //The string that we're searching for
        public string TextToSearchFor { get; set; }
        public bool found = false;

    //How to compare strings
    public System.Globalization.CompareOptions CompareOptions { get; set; }

    public TestLTES(string textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
    {
        TextToSearchFor = textToSearchFor;
        CompareOptions = compareOptions;
        charBottomLeft = new Vector[1];
        charTopRight = new Vector[1];

    }
    public TestLTES(int index, string textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
    {
        TextToSearchFor = textToSearchFor;
        CompareOptions = compareOptions;
        WordIndex = index;

    }


    //Automatically called for each chunk of text in the PDF
    public override void RenderText(TextRenderInfo renderInfo)
    {
        base.RenderText(renderInfo);
        //See if the current chunk contains the text
        if (renderInfo.GetText().Length >= this.TextToSearchFor.Length)
        {
            if (renderInfo.GetText().Split().Contains(this.TextToSearchFor))
            {
                var startPosition = System.Globalization.CultureInfo.CurrentCulture.CompareInfo.IndexOf(renderInfo.GetText(), this.TextToSearchFor, this.CompareOptions);


                //Grab the individual characters
                var chars = renderInfo.GetCharacterRenderInfos().Skip(startPosition).Take(this.TextToSearchFor.Length).ToList();
                var textpos = renderInfo.GetText();
                //.GetCharacterRenderInfos().Skip(startPosition).Take(this.TextToSearchFor.Length).ToList();


                //Grab the first and last character
                var firstChar = chars.First();
                var lastChar = chars.Last();


                //Get the bounding box for the chunk of text
                var bottomLeft = firstChar.GetDescentLine().GetStartPoint();
                var topRight = lastChar.GetAscentLine().GetEndPoint();

                //Create a rectangle from it
                var rect = new iTextSharp.text.Rectangle(
                                                        bottomLeft[Vector.I1],
                                                        bottomLeft[Vector.I2],
                                                        topRight[Vector.I1],
                                                        topRight[Vector.I2]
                                                        );

                IntegerRectangle TempRect = new IntegerRectangle();
                TempRect.Top = (int)Math.Truncate(rect.Top);
                TempRect.Bottom = (int)Math.Truncate(rect.Bottom);
                TempRect.Left = (int)Math.Truncate(rect.Left);
                TempRect.Right = (int)Math.Truncate(rect.Right);


                //Add this to our main collection
                this.myPoints.Add(new RectAndText(TempRect, rect, this.TextToSearchFor, WordIndex));
            }

            else
            {
                return;
            }
        }
        else
        {
            if (renderInfo.GetText() != "" && renderInfo.GetText()!=" ")
            {
                string[] renderTextArray = renderInfo.GetText().Split();
                for (int i = 0; i < renderTextArray.Length; i++)
                {
                    if (TextToSearchFor.Contains(renderTextArray[i]))
                    {
                        if (charBottomLeft != null && charTopRight != null)
                        {
                            var startPosition = System.Globalization.CultureInfo.CurrentCulture.CompareInfo.IndexOf(renderTextArray[i], this.TextToSearchFor, this.CompareOptions);
                            var chars = renderInfo.GetCharacterRenderInfos().Skip(startPosition).Take(this.TextToSearchFor.Length).ToList();
                            //Grab the first and last character
                            var firstChar = chars.First();
                            var lastChar = chars.Last();


                            //Get the bounding box for the chunk of text
                            var bottomLeft = firstChar.GetDescentLine().GetStartPoint();
                            var topRight = lastChar.GetAscentLine().GetEndPoint();
                            if (FoundChars.foundCharsList.Contains(renderTextArray[i] + bottomLeft + topRight))
                                return;

                            formedText = formedText + renderTextArray[i];
                            if (Counter > 0)
                            {
                                charBottomLeft = ReAllocate(charBottomLeft, Counter + 1);
                                charTopRight = ReAllocate(charTopRight, Counter + 1);
                            }
                            charBottomLeft[Counter] = bottomLeft;
                            charTopRight[Counter] = topRight;
                            Counter++;
                            FoundChars.foundCharsList.Add(renderTextArray[i] + bottomLeft + topRight);
                            if (formedText == TextToSearchFor)
                            {
                                var bLeft = charBottomLeft[0];
                                var tRight = charTopRight[Counter - 1];
                                Counter = 0;
                                formedText = "";
                                charBottomLeft = null;
                                charTopRight = null;
                                var rect = new iTextSharp.text.Rectangle(
                                                               bLeft[Vector.I1],
                                                               bLeft[Vector.I2],
                                                               tRight[Vector.I1],
                                                               tRight[Vector.I2]
                                                               );

                                IntegerRectangle TempRect = new IntegerRectangle();
                                TempRect.Top = (int)Math.Truncate(rect.Top);
                                TempRect.Bottom = (int)Math.Truncate(rect.Bottom);
                                TempRect.Left = (int)Math.Truncate(rect.Left);
                                TempRect.Right = (int)Math.Truncate(rect.Right);


                                //Add this to our main collection
                                this.myPoints.Add(new RectAndText(TempRect, rect, this.TextToSearchFor, WordIndex));
                            }
                        }
                    }
                }
            }
        }
    }
    private Vector[] ReAllocate(Vector[] arr,int counter)
    {

            Vector[] ReAllocatedArray = new Vector[counter];

                for (int j = 0; j < arr.Length; j++)
                {
                  ReAllocatedArray[j] = arr[j];
                }

            return ReAllocatedArray;

      }
   }
}

在某些情况下,它正常工作,而在某些情况下则不然。 是否有其他可行的方法来获取特定单词的坐标。

1 个答案:

答案 0 :(得分:1)

解决此问题的最佳方法是查看SimpleTextExtractionStrategy的工作原理。 在此策略中,iText还处理块并将其转换为字符串。

一般工作流程是:

  • 获取所有TextRenderInfo事​​件
  • 将它们转换为CharacterRenderInfo事​​件
  • 按逻辑阅读顺序对CharacterRenderInfo事​​件列表进行排序
  • 查看列表,如果它们靠得很近,则会将字符聚合成单词(这是一种启发式方法,iText使用&#39;小于给定字体中单个空格的宽度&#39;)。
  • 现在你有了边界(由CharacterRenderInfo.getBoundingBox提供)和单词