我使用 PDFBox 实现了从任意 PDF 中提取单词的功能。它逐行提取单词。以下是我为此使用的类:
/// <summary>
/// A <c>PDFTextStripper</c> that groups the glyphs it is handed into
/// <c>PdfWord</c> objects instead of producing plain text.
/// Words are stored in <see cref="pdfWordsByXByY"/>: the outer key is the glyph's
/// direction-adjusted Y coordinate, the inner key is the word's <c>Right</c> value
/// at the time it was stored.
/// </summary>
/// <remarks>
/// NOTE(review): <c>PdfWord</c> is declared elsewhere; <c>Right</c> is presumably
/// derived from <c>EndX</c> and <c>EndCharWidth</c> (the right edge of the last
/// glyph) - confirm against its definition.
/// </remarks>
class PDFTextLocationStripper : PDFTextStripper
{
    /// <summary>
    /// Largest distance between a glyph's X coordinate and the key of an existing
    /// word for the glyph to be appended to that word rather than start a new one.
    /// </summary>
    private const float MaxJoinDistance = 1f;

    // Not written anywhere in this class after construction; kept because it is public.
    public string textWithPostion = "";

    /// <summary>Collected words, keyed first by Y and then by the word's <c>Right</c> value.</summary>
    public Dictionary<float, Dictionary<float, PdfWord>> pdfWordsByXByY;

    /// <summary>Creates a stripper with an empty word index.</summary>
    public PDFTextLocationStripper(): base()
    {
        textWithPostion = "";
        pdfWordsByXByY = new Dictionary<float, Dictionary<float, PdfWord>>();
    }

    /// <summary>
    /// Handles a single glyph: appends it to the word on the same Y whose key is
    /// equal to (or within <see cref="MaxJoinDistance"/> of) the glyph's X, or starts
    /// a new word when no such word exists. Whitespace glyphs are ignored, which is
    /// what separates words.
    /// </summary>
    /// <param name="text">The glyph to process.</param>
    protected override void processTextPosition(TextPosition text)
    {
        try
        {
            float textX = text.getXDirAdj();
            float textY = text.getYDirAdj();
            string glyph = text.getUnicode();
            if (String.IsNullOrWhiteSpace(glyph))
            {
                return;
            }

            Dictionary<float, PdfWord> wordsByX;
            bool rowExists = pdfWordsByXByY.TryGetValue(textY, out wordsByX);
            if (!rowExists)
            {
                wordsByX = new Dictionary<float, PdfWord>();
            }

            float joinKey;
            if (rowExists && TryFindJoinKey(wordsByX, textX, out joinKey))
            {
                AppendGlyph(wordsByX, joinKey, text, glyph, textX);
                return;
            }

            PdfWord word = CreateWord(text, glyph, textX, textY);
            // NOTE(review): if another word already occupies this key the new word is
            // not stored (behavior preserved from the original code).
            if (!wordsByX.ContainsKey(word.Right))
            {
                wordsByX.Add(word.Right, word);
            }
            if (!rowExists)
            {
                pdfWordsByXByY.Add(textY, wordsByX);
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failure on one glyph must not abort the whole extraction,
            // but it is recorded instead of being silently discarded.
            System.Diagnostics.Trace.TraceError(
                "PDFTextLocationStripper.processTextPosition failed: {0}", ex);
        }
    }

    /// <summary>
    /// Finds the key of the word a glyph at <paramref name="textX"/> should join:
    /// an exact key match if there is one, otherwise the closest key provided it is
    /// no further away than <see cref="MaxJoinDistance"/>.
    /// </summary>
    /// <returns><c>true</c> when a suitable key was found.</returns>
    private static bool TryFindJoinKey(Dictionary<float, PdfWord> wordsByX, float textX, out float joinKey)
    {
        if (wordsByX.ContainsKey(textX))
        {
            joinKey = textX;
            return true;
        }

        joinKey = 0f;
        float smallestDiff = float.MaxValue;
        // Single pass over the keys; on ties the first key enumerated wins.
        foreach (float key in wordsByX.Keys)
        {
            float diff = Math.Abs(key - textX);
            if (diff < smallestDiff)
            {
                smallestDiff = diff;
                joinKey = key;
            }
        }

        // smallestDiff only drops below float.MaxValue when a key was seen, so this
        // also covers the "no keys" case without needing a sentinel key value
        // (which would wrongly exclude negative coordinates).
        return smallestDiff <= MaxJoinDistance;
    }

    /// <summary>
    /// Appends a glyph to the word stored under <paramref name="storedKey"/> and
    /// re-indexes the word under its updated <c>Right</c> value.
    /// </summary>
    private static void AppendGlyph(Dictionary<float, PdfWord> wordsByX, float storedKey, TextPosition text, string glyph, float textX)
    {
        PdfWord word = wordsByX[storedKey];
        // Words are always inserted under their Right value, so the key the word
        // was found under is the entry that has to be replaced.
        wordsByX.Remove(storedKey);

        word.EndCharWidth = text.getWidthDirAdj();
        float glyphHeight = text.getHeightDir();
        if (glyphHeight > word.Height)
        {
            word.Height = glyphHeight;
        }
        word.EndX = textX;
        word.Text += glyph;

        // NOTE(review): if another word already occupies the new key, this word is
        // not re-inserted and is therefore dropped (behavior preserved from the
        // original code).
        if (!wordsByX.ContainsKey(word.Right))
        {
            wordsByX.Add(word.Right, word);
        }
    }

    /// <summary>Builds a one-glyph word positioned at the glyph's coordinates.</summary>
    private static PdfWord CreateWord(TextPosition text, string glyph, float textX, float textY)
    {
        PdfWord word = new PdfWord();
        word.Text = glyph;
        word.EndX = word.StartX = textX;
        word.Y = textY;
        word.EndCharWidth = word.StartCharWidth = text.getWidthDirAdj();
        word.Height = text.getHeightDir();
        return word;
    }
}
这里是调用此类的代码:
/// <summary>
/// Runs a <c>PDFTextLocationStripper</c> over <paramref name="doc"/> with
/// position sorting enabled and returns the words it collected.
/// </summary>
/// <param name="doc">The document to extract words from.</param>
/// <returns>
/// The stripper's <c>pdfWordsByXByY</c> dictionary, or <c>null</c> when extraction
/// throws (the exception is written to the trace output).
/// </returns>
private Dictionary<float, Dictionary<float, PdfWord>> ExtractTextWithLocation(PDDocument doc)
{
    try
    {
        PDFTextLocationStripper textStripper = new PDFTextLocationStripper();
        textStripper.setSortByPosition(true);
        // The returned string is intentionally discarded; only the dictionary the
        // stripper fills in while processing is of interest.
        textStripper.getText(doc);
        return textStripper.pdfWordsByXByY;
    }
    catch (Exception ex)
    {
        // Keep the null-on-failure contract for callers, but leave a trace of why
        // extraction failed instead of swallowing the exception silently.
        System.Diagnostics.Trace.TraceError("ExtractTextWithLocation failed: {0}", ex);
        return null;
    }
}
此代码提取水平对齐的单词,但如何实现检测垂直或倾斜单词的功能?