将每个单独单词的坐标提取到pdf文件中的TextChunk

时间:2017-10-04 11:36:10

标签: c# pdf itext

按照this actual solution的做法,我试图获取TextChunk中的每个单词及其各自的坐标(actual page、top、bottom、left、right)。

由于TextChunk可能是一个短语、一个单词或其他任何内容,我尝试手动完成这件事:依次计算最后一个单词的矩形,并每次把它切分出来。我发现这种手动方法很容易出错(我需要自己处理特殊字符等情况),所以想问iTextSharp是否提供了更简单的方法来做到这一点。

我的Chunk类以及继承自LocationTextExtractionStrategy的类如下:

所以一旦我得到文件等等,我就这样继续:

/// <summary>
/// A piece of extracted PDF text together with its bounding rectangle, the
/// <see cref="TextRenderInfo"/> it was produced from, its font and an
/// estimated font size.
/// </summary>
public class Chunk
{
    // Font size at which the reference space width is measured when
    // estimating the rendered font size.
    private const float ReferenceFontSize = 12f;

    /// <summary>Identifier generated with <see cref="Guid.NewGuid"/> at construction.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle supplied by the caller for this chunk.</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render info this chunk was built from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font obtained from <see cref="Render"/> via <c>GetFont()</c>.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text of the chunk.</summary>
    public string Text { get; set; }

    /// <summary>
    /// Estimated font size, or 0 when the font reports no usable width for a
    /// space character and the estimate cannot be computed.
    /// </summary>
    public int FontSize { get; set; }


    /// <summary>
    /// Creates a chunk whose text is taken from <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Bounding rectangle of the chunk.</param>
    /// <param name="renderInfo">Render info the chunk originates from.</param>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
    {
        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = Render.GetText();
        Initialize();
    }


    /// <summary>
    /// Creates a chunk with an explicitly supplied text.
    /// </summary>
    /// <param name="rect">Bounding rectangle of the chunk.</param>
    /// <param name="renderInfo">Render info the chunk originates from.</param>
    /// <param name="text">Text to store instead of the render info's own text.</param>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        this.Rect = rect;
        this.Render = renderInfo;
        this.Text = text;
        Initialize();
    }


    // Fills in the members that are derived from Render rather than passed in.
    private void Initialize()
    {
        this.Id = Guid.NewGuid();
        this.BF = Render.GetFont();
        this.FontSize = ObtainFontSize();
    }

    // Estimates the font size by comparing the space width reported by the
    // render info with the width the font reports for a space at the
    // reference size, then scaling the reference size by that ratio.
    private int ObtainFontSize()
    {
        float renderedSpaceWidth = this.Render.GetSingleSpaceWidth();
        float referenceSpaceWidth = this.BF.GetWidthPoint(" ", ReferenceFontSize);

        // A zero (or otherwise unusable) reference width would make the ratio
        // NaN or infinite, and Convert.ToInt32 throws OverflowException for
        // those values, which would abort construction of the chunk.
        if (!(referenceSpaceWidth > 0f))
        {
            return 0;
        }

        float estimate = renderedSpaceWidth * ReferenceFontSize / referenceSpaceWidth;
        if (float.IsNaN(estimate) || float.IsInfinity(estimate))
        {
            return 0;
        }

        return Convert.ToInt32(estimate);
    }
}

/// <summary>
/// Location-based extraction strategy that, in addition to the base class
/// behaviour, records every non-blank rendered text piece together with its
/// bounding rectangle.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    // Chunks recorded so far, in the order RenderText received them.
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the render info to the base strategy and, when its text is
    /// not blank, stores a <see cref="Chunk"/> with the text's rectangle.
    /// </summary>
    /// <param name="renderInfo">
    /// Render info for the text being drawn; a null value is ignored.
    /// </param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null check has to come before any use of renderInfo, including
        // the base call, otherwise it can never take effect.
        if (renderInfo == null)
        {
            return;
        }

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
        {
            return;
        }

        // Opposite corners of the text: start of the descent line and end of
        // the ascent line.
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        // Rectangle spanned by those two corners.
        var rect = new Rectangle(
                           bottomLeft[Vector.I1],
                           bottomLeft[Vector.I2],
                           topRight[Vector.I1],
                           topRight[Vector.I2]);

        // Record the chunk with its coordinates.
        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}

之后,我在Mkl的解答下写了一条评论,得到的回复是“使用getCharacterRenderInfos()”;我照做了,并把每个字符都放进了一个TextRenderInfo列表中。

很抱歉,我开始把各种概念以及应用该解决方案的各种思路混在一起了,脑子已经乱成一团。

如果有人能帮我一把,我将不胜感激。

1 个答案:

答案 0 :(得分:2)

您可以使用方法TextRenderInfo.GetCharacterRenderInfos()为块中的每个字符获取TextRenderInfo的集合。然后,您可以将单个字符重新组合为单词,并使用该单词中第一个和最后一个TextRenderInfo的坐标计算包含该单词的矩形。

在自定义文本提取策略中:

 // Single-character strings that end the current word when encountered.
 // Declared with an explicit type because `var` is not permitted on fields.
 private readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };
 /// <summary>
 /// Splits a rendered text piece into words and hands each word to
 /// ProcessWord. A word ends at, and includes, any character listed in
 /// _separators; characters left over after the last separator form the
 /// final word.
 /// </summary>
 /// <param name="currentInfo">Render info of the text piece to split.</param>
 protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
    {
        var pendingGlyphs = new List<TextRenderInfo>();

        foreach (var glyph in currentInfo.GetCharacterRenderInfos())
        {
            pendingGlyphs.Add(glyph);

            var isSeparator = _separators.Contains(glyph.GetText());
            if (!isSeparator)
            {
                continue;
            }

            // The separator closes the word collected so far.
            ProcessWord(currentInfo, pendingGlyphs);
            pendingGlyphs.Clear();
        }

        // Whatever follows the last separator (possibly nothing).
        ProcessWord(currentInfo, pendingGlyphs);
    }
 /// <summary>
 /// Builds one text chunk from the per-character render infos of a word and
 /// appends it to _chunks. Does nothing when the list yields no first or
 /// last element.
 /// </summary>
 /// <param name="charChunk">
 /// Render info of the enclosing text piece; supplies the single-space width.
 /// </param>
 /// <param name="wordChunks">Per-character render infos making up the word.</param>
 private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
    {
        var head = wordChunks.FirstOrDefault();
        var tail = wordChunks.LastOrDefault();

        if (head != null && tail != null)
        {
            // The word spans from the start of the first character's descent
            // line to the end of the last character's ascent line.
            var wordStart = head.GetDescentLine().GetStartPoint();
            var wordEnd = tail.GetAscentLine().GetEndPoint();

            var text = string.Concat(wordChunks.Select(glyph => glyph.GetText()));

            var location = new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(
                wordStart,
                wordEnd,
                charChunk.GetSingleSpaceWidth());

            _chunks.Add(new CustomTextChunk(text, location));
        }
    }