我正在尝试从PDF文件中获取所有单词及其位置坐标。我在.NET
上成功使用了Acrobat API。现在,我正在尝试使用免费API获得相同的结果,例如iTextSharp(.NET
版本)。我可以用PRTokeniser
得到文本(逐行),但我不知道如何获取该行的坐标,更不用说每个单词了。
答案 0 :(得分:11)
我的帐户太新,还无法在 Mark Storer 的回答下发表评论。
我无法直接使用LocationTextExtractionStrategy(我想我一定是哪里做错了)。使用LocationTextExtractionStrategy时,我能够得到文本,但弄不清楚如何获得每个字符串(或每行字符串)的坐标。
我最终编写了LocationTextExtractionStrategy的一个子类,并公开了我想要的数据,因为这些数据确实存在于其内部。
我也想在.net中...所以这里是我放在一起的一个草率的C#版本。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using iTextSharp.text.pdf.parser;
namespace PdfHelper
{
/// <summary>
/// A <see cref="LocationTextExtractionStrategy"/> that also records where each line of text sits on the page.
/// Adapted from iText's Java LocationTextExtractionStrategy:
/// http://www.java-frameworks.com/java/itext/com/itextpdf/text/pdf/parser/LocationTextExtractionStrategy.java.html
/// </summary>
/// <remarks>
/// Chunks are collected by <see cref="RenderText"/>. <see cref="TextLocationInfo"/> is only populated
/// (and rebuilt) when <see cref="GetResultantText"/> is called.
/// </remarks>
class LocationTextExtractionStrategyEx : LocationTextExtractionStrategy
{
    private readonly List<TextChunk> m_locationResult = new List<TextChunk>();
    private readonly List<TextInfo> m_TextLocationInfo = new List<TextInfo>();

    /// <summary>
    /// The raw text chunks collected so far, one per <see cref="RenderText"/> call.
    /// The list is sorted in place by <see cref="GetResultantText"/>.
    /// </summary>
    public List<TextChunk> LocationResult
    {
        get { return m_locationResult; }
    }

    /// <summary>
    /// One entry per line of text with its corner points. Filled by <see cref="GetResultantText"/>.
    /// </summary>
    public List<TextInfo> TextLocationInfo
    {
        get { return m_TextLocationInfo; }
    }

    /// <summary>
    /// Creates a new LocationTextExtractionStrategyEx
    /// </summary>
    public LocationTextExtractionStrategyEx()
    {
    }

    /// <summary>
    /// Returns the result so far and rebuilds <see cref="TextLocationInfo"/> to match it.
    /// </summary>
    /// <returns>a String with the resulting text; lines are separated by '\n'</returns>
    public override String GetResultantText()
    {
        // Start from a clean list: without this, every call would append a second copy of each line.
        m_TextLocationInfo.Clear();
        m_locationResult.Sort();

        StringBuilder sb = new StringBuilder();
        TextChunk lastChunk = null;
        TextInfo lastTextInfo = null;
        foreach (TextChunk chunk in m_locationResult)
        {
            if (lastChunk != null && chunk.sameLine(lastChunk))
            {
                float dist = chunk.distanceFromEndOf(lastChunk);

                // This chunk starts more than one space-width before the end of the previous one.
                bool startsWellBeforePreviousEnd = dist < -chunk.CharSpaceWidth;

                // There is a visible gap and neither side of the boundary already carries a space.
                bool gapWithoutSpace = dist > chunk.CharSpaceWidth / 2.0f
                    && !StartsWithSpace(chunk.Text)
                    && !EndsWithSpace(lastChunk.Text);

                if (startsWellBeforePreviousEnd || gapWithoutSpace)
                {
                    sb.Append(' ');
                    lastTextInfo.addSpace();
                }

                sb.Append(chunk.Text);
                lastTextInfo.appendText(chunk);
            }
            else
            {
                // First chunk overall, or first chunk of a new line.
                if (lastChunk != null)
                {
                    sb.Append('\n');
                }

                sb.Append(chunk.Text);
                lastTextInfo = new TextInfo(chunk);
                m_TextLocationInfo.Add(lastTextInfo);
            }

            lastChunk = chunk;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Records the rendered text together with its baseline end points, single-space width,
    /// ascent line and descent line as a new <see cref="TextChunk"/>.
    /// </summary>
    /// <param name="renderInfo">render information for the text being drawn</param>
    public override void RenderText(TextRenderInfo renderInfo)
    {
        LineSegment segment = renderInfo.GetBaseline();
        TextChunk location = new TextChunk(renderInfo.GetText(), segment.GetStartPoint(), segment.GetEndPoint(), renderInfo.GetSingleSpaceWidth(), renderInfo.GetAscentLine(), renderInfo.GetDescentLine());
        m_locationResult.Add(location);
    }

    /// <summary>
    /// True if the string is non-empty and begins with a space. Safe for null or empty text.
    /// </summary>
    private static bool StartsWithSpace(string text)
    {
        return !string.IsNullOrEmpty(text) && text[0] == ' ';
    }

    /// <summary>
    /// True if the string is non-empty and ends with a space. Safe for null or empty text.
    /// </summary>
    private static bool EndsWithSpace(string text)
    {
        return !string.IsNullOrEmpty(text) && text[text.Length - 1] == ' ';
    }

    /// <summary>
    /// A piece of rendered text plus the values used to order chunks and group them into lines.
    /// </summary>
    public class TextChunk : IComparable, ICloneable
    {
        string m_text;
        Vector m_startLocation;
        Vector m_endLocation;
        // Unit vector pointing from the start location to the end location.
        Vector m_orientationVector;
        // Angle of the orientation vector (Math.Atan2 result times 1000), truncated to an integer.
        int m_orientationMagnitude;
        // Perpendicular distance term, truncated to an integer; equal values mean "same line".
        int m_distPerpendicular;
        // Positions of the start and end points measured along the orientation vector.
        float m_distParallelStart;
        float m_distParallelEnd;
        float m_charSpaceWidth;

        /// <summary>Ascent line supplied when the chunk was created.</summary>
        public LineSegment AscentLine;

        /// <summary>Descent line supplied when the chunk was created.</summary>
        public LineSegment DecentLine;

        /// <summary>
        /// Creates a new chunk from this chunk's current text, locations, space width and ascent/descent lines.
        /// </summary>
        public object Clone()
        {
            TextChunk copy = new TextChunk(m_text, m_startLocation, m_endLocation, m_charSpaceWidth, AscentLine, DecentLine);
            return copy;
        }

        /// <summary>The text of this chunk.</summary>
        public string Text
        {
            get { return m_text; }
            set { m_text = value; }
        }

        /// <summary>The width of a single space character, as supplied at construction.</summary>
        public float CharSpaceWidth
        {
            get { return m_charSpaceWidth; }
            set { m_charSpaceWidth = value; }
        }

        /// <summary>Start point of the chunk. Setting it does not recompute the derived sort values.</summary>
        public Vector StartLocation
        {
            get { return m_startLocation; }
            set { m_startLocation = value; }
        }

        /// <summary>End point of the chunk. Setting it does not recompute the derived sort values.</summary>
        public Vector EndLocation
        {
            get { return m_endLocation; }
            set { m_endLocation = value; }
        }

        /// <summary>
        /// Represents a chunk of text, its orientation, and location relative to the orientation vector
        /// </summary>
        /// <param name="txt">the text of the chunk</param>
        /// <param name="startLoc">start point of the chunk</param>
        /// <param name="endLoc">end point of the chunk</param>
        /// <param name="charSpaceWidth">width of a single space character</param>
        /// <param name="ascentLine">ascent line of the chunk</param>
        /// <param name="decentLine">descent line of the chunk</param>
        public TextChunk(string txt, Vector startLoc, Vector endLoc, float charSpaceWidth, LineSegment ascentLine, LineSegment decentLine)
        {
            m_text = txt;
            m_startLocation = startLoc;
            m_endLocation = endLoc;
            m_charSpaceWidth = charSpaceWidth;
            AscentLine = ascentLine;
            DecentLine = decentLine;

            // A chunk whose start and end coincide has no direction; normalizing a zero vector would
            // produce NaN components and poison every comparison, so fall back to the +x direction.
            Vector direction = m_endLocation.Subtract(m_startLocation);
            if (direction.Length == 0)
            {
                direction = new Vector(1, 0, 0);
            }
            m_orientationVector = direction.Normalize();
            m_orientationMagnitude = (int)(Math.Atan2(m_orientationVector[Vector.I2], m_orientationVector[Vector.I1]) * 1000);

            // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
            // the two vectors we are crossing are in the same plane, so the result will be purely
            // in the z-axis (out of plane) direction, so we just take the I3 component of the result
            Vector origin = new Vector(0, 0, 1);
            m_distPerpendicular = (int)(m_startLocation.Subtract(origin)).Cross(m_orientationVector)[Vector.I3];

            m_distParallelStart = m_orientationVector.Dot(m_startLocation);
            m_distParallelEnd = m_orientationVector.Dot(m_endLocation);
        }

        /// <summary>
        /// true if this location is on the same line as the other text chunk
        /// </summary>
        /// <param name="textChunkToCompare">the location to compare to</param>
        /// <returns>true if both chunks have the same orientation value and the same perpendicular distance value</returns>
        public bool sameLine(TextChunk textChunkToCompare)
        {
            return m_orientationMagnitude == textChunkToCompare.m_orientationMagnitude
                && m_distPerpendicular == textChunkToCompare.m_distPerpendicular;
        }

        /// <summary>
        /// Computes the distance between the end of 'other' and the beginning of this chunk
        /// in the direction of this chunk's orientation vector. Note that it's a bad idea
        /// to call this for chunks that aren't on the same line and orientation, but we don't
        /// explicitly check for that condition for performance reasons.
        /// </summary>
        /// <param name="other">the chunk whose end is the reference point</param>
        /// <returns>this chunk's parallel start position minus the other chunk's parallel end position</returns>
        public float distanceFromEndOf(TextChunk other)
        {
            return m_distParallelStart - other.m_distParallelEnd;
        }

        /// <summary>
        /// Compares based on orientation, perpendicular distance, then parallel distance
        /// </summary>
        /// <param name="obj">the chunk to compare with; must be a non-null <see cref="TextChunk"/></param>
        /// <returns>a negative value, zero, or a positive value when this chunk sorts before, with, or after obj</returns>
        /// <exception cref="ArgumentException">obj is null or is not a <see cref="TextChunk"/></exception>
        public int CompareTo(object obj)
        {
            TextChunk rhs = obj as TextChunk;
            if (rhs == null)
            {
                throw new ArgumentException("Object is not a TextChunk");
            }

            if (ReferenceEquals(this, rhs))
            {
                return 0;
            }

            int rslt = m_orientationMagnitude.CompareTo(rhs.m_orientationMagnitude);
            if (rslt != 0)
            {
                return rslt;
            }

            rslt = m_distPerpendicular.CompareTo(rhs.m_distPerpendicular);
            if (rslt != 0)
            {
                return rslt;
            }

            // Use a real three-way comparison: answering "greater" in both directions for equal
            // positions gives the sort an inconsistent ordering to work with.
            return m_distParallelStart.CompareTo(rhs.m_distParallelStart);
        }
    }

    /// <summary>
    /// The text of one line together with two corner points taken from its chunks.
    /// </summary>
    public class TextInfo
    {
        /// <summary>Start point of the ascent line of the first chunk on the line.</summary>
        public Vector TopLeft;

        /// <summary>End point of the descent line of the most recently appended chunk.</summary>
        public Vector BottomRight;

        private string m_Text;

        /// <summary>The text accumulated for this line so far.</summary>
        public string Text
        {
            get { return m_Text; }
        }

        /// <summary>
        /// Create a TextInfo.
        /// </summary>
        /// <param name="initialTextChunk">the first chunk of the line; supplies both corner points and the initial text</param>
        public TextInfo(TextChunk initialTextChunk)
        {
            TopLeft = initialTextChunk.AscentLine.GetStartPoint();
            BottomRight = initialTextChunk.DecentLine.GetEndPoint();
            m_Text = initialTextChunk.Text;
        }

        /// <summary>
        /// Add more text to this TextInfo and move <see cref="BottomRight"/> to the end of the added chunk.
        /// </summary>
        /// <param name="additionalTextChunk">the chunk to append</param>
        public void appendText(TextChunk additionalTextChunk)
        {
            BottomRight = additionalTextChunk.DecentLine.GetEndPoint();
            m_Text += additionalTextChunk.Text;
        }

        /// <summary>
        /// Add a space to the TextInfo. This will leave the endpoint out of sync with the text.
        /// The assumption is that you will add more text after the space which will correct the endpoint.
        /// </summary>
        public void addSpace()
        {
            m_Text += ' ';
        }
    }
}
}
我添加了一个TextLocationInfo属性,该属性返回文本行的列表以及这些行的坐标(左上角和右下角),可用于为您提供边界框。
我最初尝试时还发现了一件奇怪的事。如果我从基线(baseline)中取startPoint和endPoint(我本以为这才是正确的做法,而我现在的做法是从ascentLine和DecentLine中取这些点),得到的坐标看起来是一样的。我一开始用的就是基线。奇怪的是,两种做法得到的坐标没有什么不同。所以请多加小心……我不确定我提供的坐标是否正确……我只是认为它们应该是正确的。
答案 1 :(得分:9)
您需要使用com.itextpdf.text.pdf.parser包类。它们跟踪当前的转换,颜色,字体等。
可惜的是,这些类在新书中没有介绍,所以你只能依靠JavaDoc,并在脑中把它全部从Java转换为C#,这不是一件容易的事。
因此,您需要将LocationTextExtractionStrategy
插入PdfTextExtractor
。
这将为您提供字符串和位置,因为它们在pdf中显示。您可以将其解释为单词(以及段落,如果需要,ouch)。
请记住,PDF对文本布局一无所知。每个字符都可以单独放置。如果有人愿意(那他的脑子一定不太正常),完全可以先在页面上绘制所有的'a',然后是所有的'b',依此类推。
更现实地,有人可能会在使用FontA的页面上绘制所有文本,然后绘制FontB的所有内容,依此类推。这可以产生更有效的内容流。请注意,斜体和粗体(以及 粗体斜体 )都是单独的字体。如果有人将单词的一部分标记为粗体(或其他),则需要将该逻辑单词分解为至少两个绘图命令。
但是很多人只是按照逻辑顺序把文本写入PDF……这对于试图解析它的人来说非常方便,但你不能指望总是如此,因为你总会碰到一些不按常理出牌的文件。