我有一个应用程序,它从pdf文件中提取文本和矩形以供进一步分析。我使用ItextSharp进行提取,一切顺利,直到我偶然发现了一个文档,它有一些奇怪的表格单元格矩形。我检索的绘图命令中的值看起来比后面的矩形的实际尺寸大10倍。
只是一个例子:
2577 831.676 385.996 3.99609 re
同时,在查看文档时,所有矩形似乎都正确地适合文档页面的边界。我的猜测是应该有一些缩放命令,告诉我们这些值应该按比例缩小。假设是正确的,或者如何可能,这样的大矩形被渲染,它们是否保留在页面的边界内?
pdf文档位于此链接后面:https://www.dropbox.com/s/gyvon0dwk6a9cj0/prEVS_ISO_11620_KOM_et.pdf?dl=0
处理从PRStream中提取维度的代码如下:
private static List<PdfRect> GetRectsAndLinesFromStream(PRStream stream)
{
var streamBytes = PdfReader.GetStreamBytes(stream);
var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
List<string> newBuf = new List<string>();
List<PdfRect> rects = new List<PdfRect>();
List<string> allTokens = new List<string>();
float[,] ctm = null;
List<float[,]> ctms = new List<float[,]>();
//if current ctm has not yet been added to list
bool pendingCtm = false;
//format definition for string-> float conversion
var format = new System.Globalization.NumberFormatInfo();
format.NegativeSign = "-";
while (tokenizer.NextToken())
{
//Add them to our master buffer
newBuf.Add(tokenizer.StringValue);
if (
tokenizer.TokenType == PRTokeniser.TokType.OTHER && newBuf[newBuf.Count - 1] == "re"
)
{
float startPointX = (float)double.Parse(newBuf[newBuf.Count - 5], format);
float startPointY = (float)double.Parse(newBuf[newBuf.Count - 4], format);
float width = (float)double.Parse(newBuf[newBuf.Count - 3], format);
float height = (float)double.Parse(newBuf[newBuf.Count - 2], format);
float endPointX = startPointX + width;
float endPointY = startPointY + height;
//if transformation is defined, correct coordinates
if (ctm!=null)
{
//extract parameters
float a = ctm[0, 0];
float b = ctm[0, 1];
float c = ctm[1, 0];
float d = ctm[1, 1];
float e = ctm[2, 0];
float f = ctm[2, 1];
//reverse transformation to get x and y from x' and y'
startPointX = (startPointX - startPointY * c - e) / a;
startPointY = (startPointY - startPointX * b - f) / d;
endPointX = (endPointX - endPointY * c - e) / a;
endPointY = (endPointY - endPointX * b - f) / d;
}
rects.Add(new PdfRect(startPointX, startPointY , endPointX , endPointY ));
}
//store current ctm
else if (tokenizer.TokenType == PRTokeniser.TokType.OTHER && newBuf[newBuf.Count - 1] == "q")
{
if (ctm != null)
{
ctms.Add(ctm);
pendingCtm = false;
}
}
//fetch last ctm and remove it from list
else if (tokenizer.TokenType == PRTokeniser.TokType.OTHER && newBuf[newBuf.Count - 1] == "Q")
{
if (ctms.Count > 0)
{
ctm = ctms[ctms.Count - 1];
ctms.RemoveAt(ctms.Count -1 );
}
}
else if (tokenizer.TokenType == PRTokeniser.TokType.OTHER && newBuf[newBuf.Count - 1] == "cm")
{
// x' = x*a + y*c + e ; y' = x*b + y*d + f
float a = (float)double.Parse(newBuf[newBuf.Count - 7], format);
float b = (float)double.Parse(newBuf[newBuf.Count - 6], format);
float c = (float)double.Parse(newBuf[newBuf.Count - 5], format);
float d = (float)double.Parse(newBuf[newBuf.Count - 4], format);
float e = (float)double.Parse(newBuf[newBuf.Count - 3], format);
float f = (float)double.Parse(newBuf[newBuf.Count - 2], format);
float[,] tempCtm = ctm;
ctm = new float[3, 3] {
{a,b,0},
{c,d,0},
{e,f,1}
};
//multiply matrices to form 1 transformation matrix
if (pendingCtm && tempCtm != null)
{
float[,] resultantCtm;
if (!TryMultiplyMatrix(tempCtm, ctm, out resultantCtm))
{
throw new InvalidOperationException("Invalid transform matrix");
}
ctm = resultantCtm;
}
//current CTM has not yet been saved to stack
pendingCtm = true;
}
return rects;
}
答案 0 :(得分:2)
您要查找的命令是cm
。你看过The ABC of PDF with iText了吗?这本书还没有完成,但你已经可以下载前五章了。
这是显示cm
运算符的表格的屏幕截图:
这是使用相同语法以完全相同的方式创建的5个形状的示例
:
由于图形状态的变化,它们被添加到不同的位置,即使是不同的大小和形状:坐标系被更改,形状在更改的坐标系中呈现。