按照这个现有的解决方案(this actual solution),我试图获取 TextChunk 中的每个单词及其坐标(actual page、top、bottom、left、right)。
由于 TextChunk 可能是一个短语、一个单词或其他内容,我尝试手动完成这件事:依次计算最后一个单词的矩形,并每次把它切下来。我注意到这种手动方法可能很麻烦(我需要手动处理特殊字符等等),所以想知道 iTextSharp 是否提供了更简单的方法来做到这一点。
我的 Chunk 类以及继承自 LocationTextExtractionStrategy 的类如下:
所以一旦我拿到文件等等,我就这样继续:
/// <summary>
/// A piece of rendered text together with its bounding rectangle, the render
/// information it came from, its font and an estimated font size.
/// </summary>
public class Chunk
{
    // Reference size at which the font's space glyph is measured when estimating the font size.
    private const float ReferenceFontSize = 12f;

    /// <summary>Unique identifier generated when the chunk is constructed.</summary>
    public Guid Id { get; set; }

    /// <summary>Rectangle supplied by the caller as the location of the chunk.</summary>
    public Rectangle Rect { get; set; }

    /// <summary>Render information the chunk was built from.</summary>
    public TextRenderInfo Render { get; set; }

    /// <summary>Font reported by <see cref="Render"/>.</summary>
    public BaseFont BF { get; set; }

    /// <summary>Text of the chunk.</summary>
    public string Text { get; set; }

    /// <summary>
    /// Estimated font size; 0 when it cannot be determined because the font
    /// reports no usable width for the space character.
    /// </summary>
    public int FontSize { get; set; }

    /// <summary>
    /// Creates a chunk whose text is taken from <paramref name="renderInfo"/>.
    /// </summary>
    /// <param name="rect">Rectangle describing where the chunk is located.</param>
    /// <param name="renderInfo">Render information of the chunk; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo)
    {
        if (renderInfo == null)
        {
            throw new ArgumentNullException("renderInfo");
        }

        Rect = rect;
        Render = renderInfo;
        Text = renderInfo.GetText();
        Initialize();
    }

    /// <summary>
    /// Creates a chunk with an explicitly supplied text (for example a single
    /// word cut out of a longer render operation).
    /// </summary>
    /// <param name="rect">Rectangle describing where the chunk is located.</param>
    /// <param name="renderInfo">Render information of the chunk; must not be null.</param>
    /// <param name="text">Text to store instead of the render information's own text.</param>
    /// <exception cref="ArgumentNullException"><paramref name="renderInfo"/> is null.</exception>
    public Chunk(Rectangle rect, TextRenderInfo renderInfo, string text)
    {
        if (renderInfo == null)
        {
            throw new ArgumentNullException("renderInfo");
        }

        Rect = rect;
        Render = renderInfo;
        Text = text;
        Initialize();
    }

    // Fills in the values derived from Render: a fresh id, the font and the estimated size.
    private void Initialize()
    {
        Id = Guid.NewGuid();
        BF = Render.GetFont();
        FontSize = ObtainFontSize();
    }

    // Estimates the font size by scaling the reference size with the ratio between
    // the rendered single-space width and the font's space width at the reference size.
    private int ObtainFontSize()
    {
        float estimate = Render.GetSingleSpaceWidth() * ReferenceFontSize / BF.GetWidthPoint(" ", ReferenceFontSize);

        // A font that reports a zero-width space makes the division yield NaN or
        // Infinity, and Convert.ToInt32 would throw OverflowException on those.
        if (float.IsNaN(estimate) || float.IsInfinity(estimate))
        {
            return 0;
        }

        return Convert.ToInt32(estimate);
    }
}
/// <summary>
/// Location-based text extraction strategy that, in addition to the base
/// behavior, records every non-blank text chunk it is asked to render together
/// with a rectangle built from the chunk's descent and ascent lines.
/// </summary>
public class LocationTextExtractionPersonalizada : LocationTextExtractionStrategy
{
    /// <summary>Chunks recorded by this instance, in the order they were rendered.</summary>
    public List<Chunk> ChunksInPage = new List<Chunk>();

    /// <summary>
    /// Forwards the chunk to the base strategy and, when its text is not blank,
    /// stores it with its rectangle in <see cref="ChunksInPage"/>.
    /// </summary>
    /// <param name="renderInfo">Render information of the chunk; null is ignored.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        // The null test has to come first: both the base call and GetText()
        // use renderInfo, so testing afterwards could never take effect.
        if (renderInfo == null)
        {
            return;
        }

        base.RenderText(renderInfo);

        if (string.IsNullOrWhiteSpace(renderInfo.GetText()))
        {
            return;
        }

        // Start of the descent line and end of the ascent line are used as the
        // two opposite corners of the chunk's rectangle.
        var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
        var topRight = renderInfo.GetAscentLine().GetEndPoint();

        var rect = new Rectangle(
            bottomLeft[Vector.I1],
            bottomLeft[Vector.I2],
            topRight[Vector.I1],
            topRight[Vector.I2]);

        ChunksInPage.Add(new Chunk(rect, renderInfo));
    }
}
之后,我在 mkl 的解决方案下写了一条评论,得到的回复是“使用getCharacterRenderInfos()”。我照做了,并把每个字符都放进了一个 TextRenderInfo 的列表中。
很抱歉,我开始把各种概念以及应用该解决方案的各种做法混在一起,脑子已经乱了。
如果有人能在这里帮我一把,我将不胜感激。
答案 0 :(得分:2)
您可以使用 TextRenderInfo.GetCharacterRenderInfos() 方法,为块中的每一个字符获取对应的 TextRenderInfo,从而得到它们的集合。然后,您可以将单个字符重新组合为单词,并使用该单词中第一个和最后一个 TextRenderInfo 的坐标计算包含该单词的矩形。
在自定义文本提取策略中:
// Single-character strings that end the word currently being collected.
// Declared with an explicit type because `var` is not allowed on class members.
private static readonly string[] _separators = { "-", "(", ")", "/", " ", ":", ";", ",", "." };
/// <summary>
/// Splits a rendered chunk into words: walks its per-character render infos
/// and hands the characters collected so far to ProcessWord each time a
/// separator is reached, then once more for whatever remains at the end.
/// </summary>
/// <param name="currentInfo">Render information of the chunk to split.</param>
protected virtual void ParseRenderInfo(TextRenderInfo currentInfo)
{
    var pending = new List<TextRenderInfo>();

    foreach (var glyph in currentInfo.GetCharacterRenderInfos())
    {
        // The character is collected before the separator test, so a
        // separator stays attached to the end of the word it closes.
        pending.Add(glyph);

        if (!_separators.Contains(glyph.GetText()))
        {
            continue;
        }

        ProcessWord(currentInfo, pending);
        pending.Clear();
    }

    // Flush the characters that followed the last separator (may be none).
    ProcessWord(currentInfo, pending);
}
/// <summary>
/// Builds one word from the given character render infos and appends it, with
/// its location, to the collected chunks. Does nothing for an empty list.
/// </summary>
/// <param name="charChunk">Render information of the chunk the word was cut from; supplies the space width.</param>
/// <param name="wordChunks">Character render infos making up the word, in order.</param>
private void ProcessWord(TextRenderInfo charChunk, List<TextRenderInfo> wordChunks)
{
    var head = wordChunks.FirstOrDefault();
    var tail = wordChunks.LastOrDefault();

    if (head != null && tail != null)
    {
        // NOTE(review): the start point comes from the first character's descent
        // line and the end point from the last character's ascent line, i.e. a
        // diagonal of the word's box - confirm the location type expects that.
        var from = head.GetDescentLine().GetStartPoint();
        var to = tail.GetAscentLine().GetEndPoint();

        var text = string.Join("", wordChunks.Select(part => part.GetText()));

        _chunks.Add(
            new CustomTextChunk(
                text,
                new LocationTextExtractionStrategy.TextChunkLocationDefaultImp(from, to, charChunk.GetSingleSpaceWidth())));
    }
}