I am new to work on ITextSharp. I am able to extract the text (with correct word separation) when I implement SimpleTextExtractionStrategy, but I need the font information (like font family and font size) and I implemented the solution given in following link. It works great in terms of font information extraction but problem raised is that in output text spaces between words is lost and all characters are displayed without spaces. For example the text "Hello World" is correctly displayed by SimpleTextExtractionStrategy but it is displayed as "HelloWorld" when the given solution is implemented. Any help?. Thanks in advance. A portion of code is copied
private void Form1_Load(object sender, EventArgs e)
{
String filePath;
filePath = "C:\\paper1.pdf";
if (File.Exists(filePath))
{
PdfReader reader = new PdfReader(filePath);
TextWithFontExtractionStrategy S = new TextWithFontExtractionStrategy();
SimpleTextExtractionStrategy st = new SimpleTextExtractionStrategy();
string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
string r = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, st);
MessageBox.Show(F);
this.Close();
}
else
{
MessageBox.Show("Could not locate the file");
}
}
}
public class TextWithFontExtractionStrategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
{
private XmlHandeler xmlHandeler = new XmlHandeler();
private string textToWrite;
private StringBuilder result = new StringBuilder();
private Vector lastBaseLine;
private string lastFont;
private float lastFontSize;
private bool isBold = false;
private enum TextRenderMode
{
FillText = 0,
StrokeText = 1,
FillThenStrokeText = 2,
Invisible = 3,
FillTextAndAddToPathForClipping = 4,
StrokeTextAndAddToPathForClipping = 5,
FillThenStrokeTextAndAddToPathForClipping = 6,
AddTextToPaddForClipping = 7
}
public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
{
string curFont = renderInfo.GetFont().PostscriptFontName;
if ((renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText))
{
isBold = true;
}
Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
Single curFontSize = rect.Height;
if (curFont != lastFont || curFontSize != lastFontSize)
{
xmlHandeler.createNode(curFont, curFontSize, isBold, result.ToString());
result.Clear();
}
this.result1.Append(renderInfo.GetText());
//Set currently used properties
this.lastBaseLine = curBaseline;
this.lastFontSize = curFontSize;
this.lastFont = curFont;
}
public string GetResultantText()
{
if (result.Length > 0)
{
xmlHandeler.writeEndString();
}
xmlHandeler.createNode("abc", 12f, isBold, result.ToString());
return result.ToString();
}
//Not needed
public void BeginTextBlock() { }
public void EndTextBlock() { }
public void RenderImage(ImageRenderInfo renderInfo) { }
}
答案 0 :(得分:0)
我通过使用SimpleTextExtractionStrategy的renderText方法中给出的代码得到了解决方案。一些代码
Vector start = segment.GetStartPoint();
Vector end = segment.GetEndPoint();
bool firstRender = result.Length == 0;
bool hardReturn = false;
if (!firstRender)
{
Vector x0 = start;
Vector x1 = lastStart;
Vector x2 = lastEnd;
float dist = (x2.Subtract(x1)).Cross((x1.Subtract(x0))).LengthSquared / x2.Subtract(x1).LengthSquared;
float sameLineThreshold = 1f; // we should probably base this on the current font metrics, but 1 pt seems to be sufficient for the time being
if (dist > sameLineThreshold)
hardReturn = true;
}
if (hardReturn)
{
//AppendTextChunk('\n');
AppendTextChunk(' ');
}
else if (!firstRender)
{
if (result[result.Length - 1] != ' ' && renderInfo.GetText().Length > 0 && renderInfo.GetText()[0] != ' ')
{
// we only insert a blank space if the trailing character of the previous string wasn't a space, and the leading character of the current string isn't a space
float spacing = lastEnd.Subtract(start).Length;
if (spacing > renderInfo.GetSingleSpaceWidth() / 2f)
{
AppendTextChunk(' ');
}
}
}