我想在一组PDF文件中突出显示几个关键字。首先,我们必须识别单个单词并将其与我的关键字匹配。我找到了一个例子:
class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
//Hold each coordinate
public List<RectAndText> myPoints = new List<RectAndText>();
List<string> topicTerms;
public MyLocationTextExtractionStrategy(List<string> topicTerms)
{
this.topicTerms = topicTerms;
}
//Automatically called for each chunk of text in the PDF
public override void RenderText(TextRenderInfo renderInfo)
{
base.RenderText(renderInfo);
//Get the bounding box for the chunk of text
var bottomLeft = renderInfo.GetDescentLine().GetStartPoint();
var topRight = renderInfo.GetAscentLine().GetEndPoint();
//Create a rectangle from it
var rect = new iTextSharp.text.Rectangle(
bottomLeft[Vector.I1],
bottomLeft[Vector.I2],
topRight[Vector.I1],
topRight[Vector.I2]
);
//Add this to our main collection
//filter the meaingless words
string text = renderInfo.GetText();
this.myPoints.Add(new RectAndText(rect, renderInfo.GetText()));
但是,我发现很多单词都被打破了。例如,“stop”将是“st”和“op”。有没有其他方法可以识别单个单词及其位置?
答案 0 :(得分:0)
当您想要收集单个单词及其协调时,更好的方法是覆盖现有的LocationTextExtractionStrategy。这是我的代码:
public virtual String GetResultantText(ITextChunkFilter chunkFilter){
if (DUMP_STATE) {
DumpState();
}
List<TextChunk> filteredTextChunks = filterTextChunks(locationalResult, chunkFilter);
filteredTextChunks.Sort();
List<RectAndText> tmpList = new List<RectAndText>();
StringBuilder sb = new StringBuilder();
TextChunk lastChunk = null;
foreach (TextChunk chunk in filteredTextChunks) {
if (lastChunk == null){
sb.Append(chunk.Text);
var startLocation = chunk.StartLocation;
var endLocation = chunk.EndLocation;
var rect = new iTextSharp.text.Rectangle(startLocation[0], startLocation[1], endLocation[0], endLocation[1]);
tmpList.Add(new RectAndText(rect, chunk.Text));
} else {
if (chunk.SameLine(lastChunk)){
// we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
if (IsChunkAtWordBoundary(chunk, lastChunk) && !StartsWithSpace(chunk.Text) && !EndsWithSpace(lastChunk.Text))
{
sb.Append(' ');
if (tmpList.Count > 0)
{
mergeAndStoreChunk(tmpList);
tmpList.Clear();
}
}
sb.Append(chunk.Text);
var startLocation = chunk.StartLocation;
var endLocation = chunk.EndLocation;
var rect = new iTextSharp.text.Rectangle(startLocation[0], startLocation[1], endLocation[0], endLocation[1]);
////var topRight = renderInfo.GetAscentLine().GetEndPoint();
tmpList.Add(new RectAndText(rect,chunk.Text));
} else {
sb.Append('\n');
sb.Append(chunk.Text);
}
}
lastChunk = chunk;
}
return sb.ToString();
}
private void mergeAndStoreChunk(List<RectAndText> tmpList)
{
RectAndText mergedChunk = tmpList[0];
int tmpListCount = tmpList.Count();
for (int i = 1; i < tmpListCount; i++)
{
RectAndText nowChunk = tmpList[i];
mergedChunk.Rect.Right = nowChunk.Rect.Right;
mergedChunk.Text += nowChunk.Text;
}
this.myPoints.Add(mergedChunk);
}
myPoints是一个列表,它将返回我们想要的所有内容。