我想在 PDF 文件中搜索文本,并在找到该文本时返回它的坐标。我在网上查了一些资料,发现可以用 iTextSharp 库来实现。
我找到了下面这段代码,并正在尝试修改它以满足我的需求。我该如何把自己的文件传给这个类?
class Program
{
/// <summary>
/// Searches page 1 of a PDF file for the text "test" and prints the
/// left/bottom coordinates of every match that was recorded.
/// </summary>
/// <param name="args">
/// Optional command-line arguments. When at least one is supplied, the first
/// is used as the path of the PDF to search; otherwise "test.pdf" is used.
/// </param>
static void Main(string[] args)
{
    var testFile = @"test.pdf";
    //Let the caller pick the file on the command line instead of editing the source
    if (args != null && args.Length > 0)
    {
        testFile = args[0];
    }
    //Create an instance of our strategy
    var t = new MyLocationTextExtractionStrategy("test");
    //Parse page 1 of the document above
    using (var r = new PdfReader(testFile))
    {
        //The extracted text itself is not needed; the call is made so that the
        //strategy gets to see every text chunk and can record the matches
        PdfTextExtractor.GetTextFromPage(r, 1, t);
    }
    //Loop through each chunk found
    foreach (var p in t.myPoints)
    {
        Console.WriteLine("Found text {0} at {1}x{2}", p.Text, p.Rect.Left, p.Rect.Bottom);
    }
    //Wait for input so the console window stays open
    Console.Read();
}
/// <summary>
/// Simple pair of a piece of text and the rectangle associated with it.
/// </summary>
public class RectAndText
{
    //Rectangle stored for the text
    public iTextSharp.text.Rectangle Rect;
    //The text itself
    public String Text;

    /// <summary>
    /// Stores the given rectangle and text without copying or validating them.
    /// </summary>
    /// <param name="rect">Rectangle to store in <see cref="Rect"/>.</param>
    /// <param name="text">Text to store in <see cref="Text"/>.</param>
    public RectAndText(iTextSharp.text.Rectangle rect, String text)
    {
        Rect = rect;
        Text = text;
    }
}
/// <summary>
/// Extraction strategy that, in addition to whatever the base strategy does,
/// records a rectangle for every occurrence of <see cref="TextToSearchFor"/>
/// found inside a text chunk handed to <see cref="RenderText"/>.
/// </summary>
/// <remarks>
/// Matching is done one chunk at a time, so a search text that is split across
/// two chunks is not detected.
/// </remarks>
public class MyLocationTextExtractionStrategy : LocationTextExtractionStrategy
{
    //Hold each coordinate
    public List<RectAndText> myPoints = new List<RectAndText>();

    //The string that we're searching for
    public String TextToSearchFor { get; set; }

    //How to compare strings
    public System.Globalization.CompareOptions CompareOptions { get; set; }

    /// <summary>
    /// Creates a strategy that looks for the given text.
    /// </summary>
    /// <param name="textToSearchFor">Text to look for in each chunk.</param>
    /// <param name="compareOptions">Options passed to the culture-aware comparison; defaults to None.</param>
    public MyLocationTextExtractionStrategy(String textToSearchFor, System.Globalization.CompareOptions compareOptions = System.Globalization.CompareOptions.None)
    {
        this.TextToSearchFor = textToSearchFor;
        this.CompareOptions = compareOptions;
    }

    /// <summary>
    /// Forwards the chunk to the base strategy, then records one
    /// <see cref="RectAndText"/> in <see cref="myPoints"/> for every
    /// non-overlapping occurrence of the search text inside the chunk.
    /// </summary>
    /// <param name="renderInfo">The text chunk being rendered.</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        base.RenderText(renderInfo);

        var searchText = this.TextToSearchFor;
        var chunkText = renderInfo.GetText();

        //Nothing can match an empty/null search text or an empty chunk; bailing out
        //here also avoids taking First()/Last() of an empty character list
        if (String.IsNullOrEmpty(searchText) || String.IsNullOrEmpty(chunkText))
        {
            return;
        }

        var compareInfo = System.Globalization.CultureInfo.CurrentCulture.CompareInfo;
        //Grab the individual characters once and reuse them for every match
        var charInfos = renderInfo.GetCharacterRenderInfos();

        var searchFrom = 0;
        while (searchFrom < chunkText.Length)
        {
            //See if the rest of the current chunk contains the text
            var startPosition = compareInfo.IndexOf(chunkText, searchText, searchFrom, this.CompareOptions);
            //If not found bail
            if (startPosition < 0)
            {
                break;
            }
            //The per-character list can be shorter than the chunk text; stop instead of throwing
            if (startPosition >= charInfos.Count)
            {
                break;
            }

            //Grab the first and last character of the match (clamped to the available characters)
            var lastIndex = Math.Min(startPosition + searchText.Length, charInfos.Count) - 1;
            var firstChar = charInfos[startPosition];
            var lastChar = charInfos[lastIndex];

            //Get the bounding box for the matched text
            var bottomLeft = firstChar.GetDescentLine().GetStartPoint();
            var topRight = lastChar.GetAscentLine().GetEndPoint();

            //Create a rectangle from it
            var rect = new iTextSharp.text.Rectangle(
                bottomLeft[Vector.I1],
                bottomLeft[Vector.I2],
                topRight[Vector.I1],
                topRight[Vector.I2]
            );

            //Add this to our main collection
            this.myPoints.Add(new RectAndText(rect, searchText));

            //Continue after this match so later occurrences in the same chunk are found too
            searchFrom = startPosition + searchText.Length;
        }
    }
}
答案 0 :(得分:1)
据我所知,PdfReader 的构造函数接受一个字符串,即要读取的文件的路径。所以只需修改
var testFile = @"test.pdf";
让它指向您要使用的文件。(如果该文件不在应用程序的工作文件夹中,您可能需要写出完整路径)
答案 1 :(得分:0)
OMG!您的代码对于这个任务来说太复杂了。我可以推荐一个简单得多的解决方案,请看下面的代码:
//Searches every page of the document for a phrase and reads the left/top
//coordinates of each rectangle reported for each match.
//NOTE(review): PdfDocument/FindFlags look like the Pdfium.NET SDK API - confirm
//Open PDF document
using (var doc = PdfDocument.Load(@"d:\0\test_big.pdf"))
{
    //Enumerate pages
    foreach (var page in doc.Pages)
    {
        try
        {
            var found = page.Text.Find("text for search", FindFlags.MatchWholeWord, 0);
            if (found == null)
            {
                //Nothing found on this page; move on to the next page instead of
                //abandoning the rest of the document
                continue;
            }
            do
            {
                var textInfo = found.FindedText;
                foreach (var rect in textInfo.Rects)
                {
                    float x = rect.left;
                    float y = rect.top;
                    //...
                }
            } while (found.FindNext());
        }
        finally
        {
            //Release the page whether or not it contained a match
            page.Dispose();
        }
    }
}