我实现了自己的 LocationTextExtractionStrategy。在某些 PDF 中,renderInfo 会把若干单词作为一个块整体返回;而在另一些 PDF 中,则是逐个字符返回。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using iTextSharp.text.pdf.parser;
namespace PDFAnnotater
{
/// <summary>
/// Location-based text extraction strategy that additionally records a bounding
/// rectangle for each occurrence of <see cref="TextToSearchFor"/> it sees.
/// </summary>
/// <remarks>
/// Two kinds of chunk are handled in <see cref="RenderText"/>:
/// chunks at least as long as the search term are scanned for whitespace-delimited
/// tokens equal to the term; shorter chunks are treated as fragments and are
/// concatenated across successive calls until they spell the term.
/// </remarks>
public class TestLTES : LocationTextExtractionStrategy
{
    // Passed through unchanged to every RectAndText created by this instance.
    int WordIndex;

    //Hold each coordinate
    public List<RectAndText> myPoints = new List<RectAndText>();

    // Concatenation of the fragments accepted so far for the match being assembled.
    public string formedText = "";

    // Per-fragment corners of the match being assembled; slot i belongs to fragment i.
    public Vector[] charBottomLeft;
    public Vector[] charTopRight;

    // Number of fragments currently stored in the two arrays above.
    public int Counter = 0;

    //The string that we're searching for
    public string TextToSearchFor { get; set; }

    // Not read or written anywhere in this class; kept so existing callers still compile.
    public bool found = false;

    //How to compare strings
    public System.Globalization.CompareOptions CompareOptions { get; set; }

    /// <summary>Creates a strategy that searches for <paramref name="textToSearchFor"/>.</summary>
    /// <param name="textToSearchFor">The word to locate.</param>
    /// <param name="compareOptions">Options used for every comparison against the search term.</param>
    public TestLTES(string textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
    {
        TextToSearchFor = textToSearchFor;
        CompareOptions = compareOptions;
        charBottomLeft = new Vector[1];
        charTopRight = new Vector[1];
    }

    /// <summary>
    /// Creates a strategy that searches for <paramref name="textToSearchFor"/> and tags
    /// every result with <paramref name="index"/>.
    /// </summary>
    /// <param name="index">Value forwarded to each <c>RectAndText</c> result.</param>
    /// <param name="textToSearchFor">The word to locate.</param>
    /// <param name="compareOptions">Options used for every comparison against the search term.</param>
    public TestLTES(int index, string textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
        : this(textToSearchFor, compareOptions)
    {
        // Chaining to the other constructor allocates the fragment arrays; without
        // them fragment assembly was silently disabled for this overload.
        WordIndex = index;
    }

    //Automatically called for each chunk of text in the PDF
    /// <summary>
    /// Forwards the chunk to the base strategy, then looks for the search term in it.
    /// </summary>
    /// <param name="renderInfo">The chunk being rendered.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        base.RenderText(renderInfo);

        string chunkText = renderInfo.GetText();
        if (string.IsNullOrEmpty(TextToSearchFor) || string.IsNullOrEmpty(chunkText))
        {
            return;
        }

        var compareInfo = System.Globalization.CultureInfo.CurrentCulture.CompareInfo;
        List<KeyValuePair<int, string>> tokens = SplitWithOffsets(chunkText);

        if (chunkText.Length >= TextToSearchFor.Length)
        {
            // The chunk is long enough to hold the whole term: report every token equal to it.
            foreach (KeyValuePair<int, string> token in tokens)
            {
                if (compareInfo.Compare(token.Value, TextToSearchFor, CompareOptions) != 0)
                {
                    continue;
                }

                Vector wordBottomLeft;
                Vector wordTopRight;
                // The token's own offset is used so the box cannot land on an earlier
                // substring hit inside a longer word.
                if (TryGetBounds(renderInfo, token.Key, token.Value.Length, out wordBottomLeft, out wordTopRight))
                {
                    AddMatch(wordBottomLeft, wordTopRight);
                }
            }
            return;
        }

        // The chunk is shorter than the term: treat its tokens as fragments of it.
        foreach (KeyValuePair<int, string> token in tokens)
        {
            Vector bottomLeft;
            Vector topRight;
            if (!TryGetBounds(renderInfo, token.Key, token.Value.Length, out bottomLeft, out topRight))
            {
                continue;
            }

            // Same key format as before: fragment text followed by both corner vectors.
            string fragmentKey = token.Value + bottomLeft + topRight;
            if (FoundChars.foundCharsList.Contains(fragmentKey))
            {
                // Already recorded; skip only this fragment, not the rest of the chunk.
                continue;
            }

            string candidate = formedText + token.Value;
            if (!compareInfo.IsPrefix(TextToSearchFor, candidate, CompareOptions))
            {
                // The fragment does not continue the partial match. Discard the partial
                // match and see whether this fragment can start a new one.
                // NOTE: this is a plain restart, so a term whose prefix overlaps itself
                // (e.g. "aab" inside "aaab") can still be missed.
                ResetPartialMatch();
                if (!compareInfo.IsPrefix(TextToSearchFor, token.Value, CompareOptions))
                {
                    continue;
                }
                candidate = token.Value;
            }

            EnsureSlot(ref charBottomLeft, Counter);
            EnsureSlot(ref charTopRight, Counter);
            charBottomLeft[Counter] = bottomLeft;
            charTopRight[Counter] = topRight;
            Counter++;
            formedText = candidate;
            FoundChars.foundCharsList.Add(fragmentKey);

            if (compareInfo.Compare(formedText, TextToSearchFor, CompareOptions) == 0)
            {
                Vector matchBottomLeft = charBottomLeft[0];
                Vector matchTopRight = charTopRight[Counter - 1];
                // Reset to fresh arrays (not null) so later occurrences can be assembled too.
                ResetPartialMatch();
                AddMatch(matchBottomLeft, matchTopRight);
            }
        }
    }

    // Splits text on whitespace and returns each non-empty token with its start offset in text.
    private static List<KeyValuePair<int, string>> SplitWithOffsets(string text)
    {
        var tokens = new List<KeyValuePair<int, string>>();
        int position = 0;
        while (position < text.Length)
        {
            if (char.IsWhiteSpace(text[position]))
            {
                position++;
                continue;
            }

            int start = position;
            while (position < text.Length && !char.IsWhiteSpace(text[position]))
            {
                position++;
            }
            tokens.Add(new KeyValuePair<int, string>(start, text.Substring(start, position - start)));
        }
        return tokens;
    }

    // Gets the descent-line start of the first glyph and the ascent-line end of the last
    // glyph in [start, start + length); returns false when that range holds no glyphs.
    private static bool TryGetBounds(TextRenderInfo renderInfo, int start, int length, out Vector bottomLeft, out Vector topRight)
    {
        bottomLeft = null;
        topRight = null;

        var glyphs = renderInfo.GetCharacterRenderInfos().Skip(start).Take(length).ToList();
        if (glyphs.Count == 0)
        {
            return false;
        }

        bottomLeft = glyphs[0].GetDescentLine().GetStartPoint();
        topRight = glyphs[glyphs.Count - 1].GetAscentLine().GetEndPoint();
        return true;
    }

    // Makes sure slots[index] exists, allocating or growing the array as needed.
    private static void EnsureSlot(ref Vector[] slots, int index)
    {
        if (slots == null)
        {
            slots = new Vector[index + 1];
        }
        else if (slots.Length <= index)
        {
            Array.Resize(ref slots, index + 1);
        }
    }

    // Clears the partially assembled match and restores the state set up by the constructor.
    private void ResetPartialMatch()
    {
        Counter = 0;
        formedText = "";
        charBottomLeft = new Vector[1];
        charTopRight = new Vector[1];
    }

    // Builds the float and truncated-integer rectangles for one match and stores them in myPoints.
    private void AddMatch(Vector bottomLeft, Vector topRight)
    {
        //Create a rectangle from it
        var rect = new iTextSharp.text.Rectangle(
            bottomLeft[Vector.I1],
            bottomLeft[Vector.I2],
            topRight[Vector.I1],
            topRight[Vector.I2]
        );

        IntegerRectangle TempRect = new IntegerRectangle();
        TempRect.Top = (int)Math.Truncate(rect.Top);
        TempRect.Bottom = (int)Math.Truncate(rect.Bottom);
        TempRect.Left = (int)Math.Truncate(rect.Left);
        TempRect.Right = (int)Math.Truncate(rect.Right);

        //Add this to our main collection
        this.myPoints.Add(new RectAndText(TempRect, rect, this.TextToSearchFor, WordIndex));
    }
}
}
这段代码在某些情况下可以正常工作,在另一些情况下则不行。是否有其他可行的方法来获取特定单词的坐标?
答案 0 :(得分:1)
解决此问题的最佳方法是查看SimpleTextExtractionStrategy的工作原理。 在此策略中,iText还处理块并将其转换为字符串。
一般工作流程是: