Question

我有一个应用程序，它从PDF文件中提取文本和矩形以供进一步分析。我使用iTextSharp进行提取，一切都很顺利，直到我遇到了一个文档，其中的表格单元格矩形有些奇怪：我从绘图命令中读取到的数值，看起来比页面上实际显示的矩形尺寸大10倍。

只是一个例子：

2577 831.676 385.996 3.99609 re

然而，在查看文档时，所有矩形看起来都正确地位于页面边界之内。我的猜测是，应该存在某种缩放命令，用来说明这些数值需要按比例缩小。这个假设正确吗？如果不正确，这么大的矩形又是如何被渲染出来、并且仍然保持在页面边界之内的？

pdf文档位于此链接后面：https://www.dropbox.com/s/gyvon0dwk6a9cj0/prEVS_ISO_11620_KOM_et.pdf?dl=0

处理从PRStream中提取维度的代码如下：

/// <summary>
/// Scans a PDF content stream for rectangle operators (<c>re</c>) and returns
/// every rectangle found, with its corners mapped through the current
/// transformation matrix (CTM) that is in effect at the point of the operator.
/// </summary>
/// <remarks>
/// The CTM is tracked by interpreting the <c>cm</c> (concatenate matrix),
/// <c>q</c> (save graphics state) and <c>Q</c> (restore graphics state)
/// operators of this stream only. It starts as the identity matrix, so a
/// stream without any <c>cm</c> yields the raw operand values.
/// Matrices use the row-vector layout <c>{a,b,0},{c,d,0},{e,f,1}</c>, i.e.
/// <c>x' = x*a + y*c + e</c> and <c>y' = x*b + y*d + f</c>.
/// </remarks>
/// <param name="stream">The content stream whose decoded bytes are tokenized.</param>
/// <returns>One <c>PdfRect</c> per <c>re</c> operator, in stream order.</returns>
/// <exception cref="System.FormatException">
/// An operand of <c>re</c> or <c>cm</c> is not a parsable number.
/// </exception>
/// <exception cref="System.ArgumentOutOfRangeException">
/// A <c>re</c> or <c>cm</c> operator is preceded by fewer tokens than it needs.
/// </exception>
private static List<PdfRect> GetRectsAndLinesFromStream(PRStream stream)
    {
        var streamBytes = PdfReader.GetStreamBytes(stream);
        var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
        List<string> newBuf = new List<string>();
        List<PdfRect> rects = new List<PdfRect>();

        // Current transformation matrix. Starting from identity (instead of null)
        // means every code path below can treat the CTM uniformly.
        float[,] ctm = new float[3, 3] {
        {1, 0, 0},
        {0, 1, 0},
        {0, 0, 1}
        };

        // Graphics-state stack: one entry per unmatched "q". Matrices are never
        // mutated in place ("cm" always builds a new array), so storing the
        // reference is enough to snapshot the state.
        Stack<float[,]> savedCtms = new Stack<float[,]>();

        //format definition for string-> float conversion
        var format = new System.Globalization.NumberFormatInfo();
        format.NegativeSign = "-";

        while (tokenizer.NextToken())
        {
            //Add them to our master buffer; operands are read back from it
            //relative to the operator that follows them
            newBuf.Add(tokenizer.StringValue);

            // Operators are reported as OTHER tokens; anything else (numbers,
            // names, strings that merely read "re", ...) is only an operand.
            if (tokenizer.TokenType != PRTokeniser.TokType.OTHER)
            {
                continue;
            }

            string op = newBuf[newBuf.Count - 1];

            if (op == "re")
            {
                // Operands: x y width height, given in the current user space.
                float startPointX = (float)double.Parse(newBuf[newBuf.Count - 5], format);
                float startPointY = (float)double.Parse(newBuf[newBuf.Count - 4], format);
                float width = (float)double.Parse(newBuf[newBuf.Count - 3], format);
                float height = (float)double.Parse(newBuf[newBuf.Count - 2], format);

                float endPointX = startPointX + width;
                float endPointY = startPointY + height;

                float a = ctm[0, 0];
                float b = ctm[0, 1];
                float c = ctm[1, 0];
                float d = ctm[1, 1];
                float e = ctm[2, 0];
                float f = ctm[2, 1];

                // The operands are the untransformed (x, y); applying the CTM
                // forward gives the position on the page. Each result is stored
                // in its own variable so the Y formula sees the untransformed X.
                // NOTE(review): with a rotating/skewing CTM the two mapped corners
                // still span the diagonal, but the shape is no longer axis-aligned;
                // confirm that is acceptable for PdfRect consumers.
                float pageStartX = startPointX * a + startPointY * c + e;
                float pageStartY = startPointX * b + startPointY * d + f;
                float pageEndX = endPointX * a + endPointY * c + e;
                float pageEndY = endPointX * b + endPointY * d + f;

                rects.Add(new PdfRect(pageStartX, pageStartY, pageEndX, pageEndY));
            }
            else if (op == "q")
            {
                // Always push, even when no "cm" has been seen yet, so that
                // every "Q" restores exactly the state saved by its own "q".
                savedCtms.Push(ctm);
            }
            else if (op == "Q")
            {
                // An unmatched "Q" is ignored rather than treated as an error.
                if (savedCtms.Count > 0)
                {
                    ctm = savedCtms.Pop();
                }
            }
            else if (op == "cm")
            {
                // x' = x*a + y*c + e ; y' = x*b + y*d + f
                float a = (float)double.Parse(newBuf[newBuf.Count - 7], format);
                float b = (float)double.Parse(newBuf[newBuf.Count - 6], format);
                float c = (float)double.Parse(newBuf[newBuf.Count - 5], format);
                float d = (float)double.Parse(newBuf[newBuf.Count - 4], format);
                float e = (float)double.Parse(newBuf[newBuf.Count - 3], format);
                float f = (float)double.Parse(newBuf[newBuf.Count - 2], format);

                float[,] operandMatrix = new float[3, 3] {
                {a, b, 0},
                {c, d, 0},
                {e, f, 1}
                };

                // "cm" concatenates: the new matrix is applied first, then the
                // CTM that was already in effect (new CTM = operand x old CTM).
                ctm = MultiplyCtm(operandMatrix, ctm);
            }
        }

        return rects;
    }

/// <summary>
/// Returns the 3x3 matrix product <paramref name="left"/> x <paramref name="right"/>
/// as a newly allocated array; neither input is modified.
/// </summary>
private static float[,] MultiplyCtm(float[,] left, float[,] right)
    {
        var product = new float[3, 3];
        for (int row = 0; row < 3; row++)
        {
            for (int col = 0; col < 3; col++)
            {
                float sum = 0;
                for (int k = 0; k < 3; k++)
                {
                    sum += left[row, k] * right[k, col];
                }
                product[row, col] = sum;
            }
        }
        return product;
    }

Answer 1

您要查找的命令是cm。你看过The ABC of PDF with iText了吗？这本书还没有完成，但你已经可以下载前五章了。

这是显示cm运算符的表格的屏幕截图：

enter image description here

下面是使用相同语法、以完全相同的方式创建的5个形状的示例：

由于图形状态发生了变化，这些形状被绘制在不同的位置，甚至具有不同的大小和形状：坐标系被更改之后，形状是在更改后的坐标系中呈现的。

是否有缩放矩形坐标的PDF命令？

1 个答案: