Question

我已经开始讨论快速DCT的实施问题。我已经找到了Loeffler算法，并且我已经用C ++和ARM组装实现了NEON。继续前进，我发现了避免浮动计算的binDCT。我的参考文件/架构是这样的：

那就是说，我已尝试使用以下代码在C ++中实现，只是为了测试：

void my_binDCT(int in[8][8], int data[8][8],const int xpos, const int ypos)
{
    int i;
    int row[8][8];

    int x0, x1, x2, x3, x4, x5, x6, x7;
    int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17;

    // transform rows 
    for (i = 0; i < 8; i++) {
        x0 = in[xpos + 0][ypos + i];
        x1 = in[xpos + 1][ypos + i];
        x2 = in[xpos + 2][ypos + i];
        x3 = in[xpos + 3][ypos + i];
        x4 = in[xpos + 4][ypos + i];
        x5 = in[xpos + 5][ypos + i];
        x6 = in[xpos + 6][ypos + i];
        x7 = in[xpos + 7][ypos + i];

        //stage 1
        tmp0 = x0 + x7;
        tmp7 = x0 - x7;
        tmp1 = x1 + x6; 
        tmp6 = x1 - x6;
        tmp2 = x2 + x5;
        tmp5 = x2 - x5;
        tmp3 = x3 + x4;
        tmp4 = x3 - x4;

        //stage 2
        tmp16 = ((tmp5*3)>>3) + tmp6;
        tmp15 = ((tmp16*5)>>3) - tmp5;

        //stage 3
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        tmp14 = tmp4 + tmp15;
        tmp15 = tmp4 - tmp15;

        auto z = tmp16;
        tmp16 = tmp7 - tmp16;
        tmp17 = z + tmp7;

        //stage 4
        tmp14 = (tmp17 >> 3) - tmp14;

        tmp10 = tmp10 + tmp11;
        tmp11 = (tmp10 >> 1) - tmp11;

        tmp12 = ((tmp13*3)>>3) - tmp12;
        tmp13 = ((tmp12*3)>>3) + tmp13;

        tmp15 = ((tmp16*7)>>3) + tmp15;
        tmp16 = (tmp15>>1) - tmp16;


        //stage 5
        row[i][0] = tmp10;
        row[i][4] = tmp11;
        row[i][6] = tmp12;
        row[i][2] = tmp13;
        row[i][7] = tmp14;
        row[i][5] = tmp15;
        row[i][3] = tmp16;
        row[i][1] = tmp17;
    }

    //rotate columns
    /* transform columns */
    for (i = 0; i < 8; i++) {

        x0 = row[0][i];
        x1 = row[1][i];
        x2 = row[2][i];
        x3 = row[3][i];
        x4 = row[4][i];
        x5 = row[5][i];
        x6 = row[6][i];
        x7 = row[7][i];

        //stage 1
        tmp0 = x0 + x7;
        tmp7 = x0 - x7;
        tmp1 = x1 + x6; 
        tmp6 = x1 - x6;
        tmp2 = x2 + x5;
        tmp5 = x2 - x5;
        tmp3 = x3 + x4;
        tmp4 = x3 - x4;

        //stage 2
        tmp16 = ((tmp5*3)>>3) + tmp6;
        tmp15 = ((tmp16*5)>>3) - tmp5;

        //stage 3
        tmp10 = tmp0 + tmp3;
        tmp13 = tmp0 - tmp3;
        tmp11 = tmp1 + tmp2;
        tmp12 = tmp1 - tmp2;

        tmp14 = tmp4 + tmp15;
        tmp15 = tmp4 - tmp15;

        auto z = tmp16;
        tmp16 = tmp7 - tmp16;
        tmp17 = z + tmp7;

        //stage 4
        tmp14 = (tmp17 >> 3) - tmp14;

        tmp10 = tmp10 + tmp11;
        tmp11 = (tmp10 >> 1) - tmp11;

        tmp12 = ((tmp13*3)>>3) - tmp12;
        tmp13 = ((tmp12*3)>>3) + tmp13;

        tmp15 = ((tmp16*7)>>3) + tmp15;
        tmp16 = (tmp15>>1) - tmp16;

        //stage 5
        data[0][i] = tmp10 >> 3;
        data[4][i] = tmp11 >> 3;
        data[6][i] = tmp12 >> 3;
        data[2][i] = tmp13 >> 3;
        data[7][i] = tmp14 >> 3;
        data[5][i] = tmp15 >> 3;
        data[3][i] = tmp16 >> 3;
        data[1][i] = tmp17 >> 3;
    }
}

我按行编码了第一个DCT，按列编写了第二个DCT，并且我应该将结果归一化除以8（根据DCT公式，N = 8）。

我已经在8x8矩阵上进行了测试：

int matrix_a[8][8] = {
                        12, 16, 19, 12, 12, 27, 51, 47,

                        16, 24, 12, 19, 12, 20, 39, 51,

                        24, 27, 8,  39, 35, 34, 24, 44,

                        40, 17, 28, 32, 24, 27, 8,  32,

                        34, 20, 28, 20, 12, 8,  19, 34,

                        19, 39, 12, 27, 27, 12, 8,  34,

                        8,  28, -5, 39, 34, 16, 12, 19,

                        20, 27, 8,  27, 24, 19, 19, 8,
};

我得到了这个结果：

MYBINDCT-2: 

186 13 -3 4 -2 4 6 0 
-13 -20 -10 1 2 -2 1 -4 
1 19 -10 -3 7 -12 -2 -4 
5 2 -4 -3 -1 -4 -2 -1 
11 -5 -7 1 -3 4 -1 0 
-13 8 -3 0 10 -4 -6 3 
-11 6 -11 1 6 0 -1 -4 
-13 4 -1 -3 5 -5 -1 0

与（圆形）真实dct相距甚远：

186 20 -11 -9 -4 3 8 -1 
-18 -35 -24 -5 9 -3 0 -8 
14 26 -2 14 7 -19 -3 -3 
-9 -10 5 -15 1 8 3 1 
23 -11 -19 -9 -11 8 -2 1 
-10 10 3 -3 17 -4 -8 4 
-14 13 -21 -4 18 0 -1 -7 
-19 7 -1 8 15 -7 -3 0

我已经应用了算法，做了很多测试，但我仍然不明白我犯了哪些错误。

有没有比我更好的经验的人可以解释我已经犯过的错误吗？奇怪的是，正如我所写的那样，我已经实现了Loeffler，而且效果非常好。除了系数和浮点数之外，该过程非常相似（蝶式模式，浮动缩放因子，归一化）。我坚持了下来。感谢大家可以给我答案。

编辑：简短的电话是：

int main(int argc, char **argv)
{
    int MYBINDCT[8][8];
    my_binDCT(matrix_a, MYBINDCT, 0, 0);


    cout << "\nMYBINDCT: \n";
    for (int i = 0; i < 8; i++)
    {
        cout << '\n;
        for (int j = 0; j < 8; j++)
        {
            cout << MYBINDCT[i][j] << " ";
        }
    }
    return 0;
}

Answer 1

不具有乘数（或具有3或5的粗略乘数）的计算方案不能非常精确;我认为你的结果确实没问题。

如果您的论文有任何好处，则应指定预期的结果精确度。否则，42是8x8 DCT问题的一个非常普遍的答案，具有未指定的精度。

在对DCT进行近似时，通过更容易实现的东西替换DCT的定义是很常见的。如果使用DCT进行图像压缩，则只要您也相应地更改IDCT（逆变换），则将DCT的定义更改为任何变换将起作用。例如，H.264（视频编码标准）就是这样做的。

用于8x8矩阵的binDCT算法

1 个答案: