Question

我想通过循环展开（loop unrolling）来优化我的代码。我尝试过进行展开，但似乎没有做对，而且我看不出问题出在哪里。我想对外层循环进行展开。

这个循环会对矩阵进行转置。

这是我应用展开循环的循环：

/*
 * Transpose a square matrix stored in row-major order.
 *
 * dim: number of rows (and columns) of the matrix
 * src: input matrix, read as dim * dim ints
 * dst: output matrix, written as dim * dim ints so that
 *      dst[j * dim + i] == src[i * dim + j]
 *
 * Nothing is read or written when dim <= 0.
 */
void transpose(int dim, int *src, int *dst) {
    /* The loop counters are declared here; the snippet used them undeclared. */
    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            dst[j * dim + i] = src[i * dim + j];
        }
    }
}

这是我的展开循环：

/*
 * Transpose a dim x dim row-major matrix, taking the source rows in strips
 * of eight.
 *
 * dim: number of rows (and columns)
 * src: input matrix, read as dim * dim ints
 * dst: output matrix; receives dst[col * dim + row] = src[row * dim + col]
 *
 * Every row of a strip after the first is bounds-checked individually, so
 * dim does not have to be a multiple of eight.
 */
void transpose(int dim, int *src, int *dst) {
    for (int strip = 0; strip < dim; strip += 8) {
        for (int col = 0; col < dim; col++) {
            int row = strip;

            /* The first row of the strip is in range thanks to the outer loop test. */
            dst[col * dim + row] = src[row * dim + col];

            /*
             * Rows strip+1 .. strip+7: stop handling this column as soon as
             * a row index falls outside the matrix.
             */
            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];

            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];

            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];

            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];

            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];

            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];

            if (++row >= dim) { continue; }
            dst[col * dim + row] = src[row * dim + col];
        }
    }
}

Answer 1

我不确定您当前代码中的错误是什么，但这是另一种方法。

（实现代码见下方。）

注意：还可以通过引入中间变量来减少乘法次数，例如先用 `int jdim = j * dim + i;` 保存公共下标，再依次写 `dst[jdim + 0]`、`dst[jdim + 1]` 等。

/*
 * Transpose a dim x dim row-major matrix with the outer (source-row) loop
 * unrolled eight times.
 *
 * dim: number of rows (and columns)
 * src: input matrix, read as dim * dim ints
 * dst: output matrix; receives dst[col * dim + row] = src[row * dim + col]
 *
 * Complete groups of eight source rows are copied without any per-row
 * bounds test; rows left over after the last complete group are copied by
 * a plain loop.
 */
void transpose(int dim, int *src, int *dst) {
    int row = 0;

    /* Complete groups of eight rows: row .. row + 7 are all inside the matrix. */
    while (row <= dim - 8) {
        for (int col = 0; col < dim; col++) {
            dst[col * dim + row]     = src[row * dim + col];
            dst[col * dim + row + 1] = src[(row + 1) * dim + col];
            dst[col * dim + row + 2] = src[(row + 2) * dim + col];
            dst[col * dim + row + 3] = src[(row + 3) * dim + col];
            dst[col * dim + row + 4] = src[(row + 4) * dim + col];
            dst[col * dim + row + 5] = src[(row + 5) * dim + col];
            dst[col * dim + row + 6] = src[(row + 6) * dim + col];
            dst[col * dim + row + 7] = src[(row + 7) * dim + col];
        }
        row += 8;
    }

    /* Rows that did not fill a complete group of eight. */
    while (row < dim) {
        for (int col = 0; col < dim; col++) {
            dst[col * dim + row] = src[row * dim + col];
        }
        row++;
    }
}

同样的做法也适用于赋值语句右侧（RHS）的 src 下标计算。

Answer 2

循环展开的全部目的在于消除条件测试。由于您没有对 dim 的取值做任何假设，所以只能保留所有测试。我怀疑展开后的代码不会带来什么改进，不过只有仔细的基准测试才能告诉您，在给定的编译器和架构下它是否真的有差别。

有一件事是肯定的：它使代码更难以阅读，更容易搞砸。

如果您知道dim的最常见值，则可以尝试优化这些值。例如，如果你知道最常见的情况是3x3矩阵，你可以这样写：

/*
 * Transpose a dim x dim row-major matrix, with a fully unrolled fast path
 * for the 3x3 case.
 *
 * dim: number of rows (and columns)
 * src: input matrix, read as dim * dim ints
 * dst: output matrix; receives dst[col * dim + row] = src[row * dim + col]
 */
void transpose(int dim, const int *src, int *dst) {
    if (dim != 3) {
        /* General case: ordinary doubly nested loop. */
        for (int row = 0; row < dim; row++) {
            for (int col = 0; col < dim; col++) {
                dst[col * dim + row] = src[row * dim + col];
            }
        }
        return;
    }

    /* 3x3 case: nine straight-line copies, indices folded to constants. */
    dst[0] = src[0];
    dst[1] = src[3];
    dst[2] = src[6];
    dst[3] = src[1];
    dst[4] = src[4];
    dst[5] = src[7];
    dst[6] = src[2];
    dst[7] = src[5];
    dst[8] = src[8];
}

现代编译器擅长优化简单直接的原始代码，并能利用特定硬件的矢量化功能。除非您确切地知道要优化什么以及何时进行优化，否则编译器会比您做得好得多，而且不会引入不必要的 bug。

Answer 3

以下是展开循环的示例。请注意，目标是删除条件语句和变量依赖项。此外，此代码尚未经过测试。

/*
 * Transpose a dim x dim row-major matrix, unrolling the inner loop by four.
 *
 * dim: number of rows (and columns)
 * src: input matrix, read as dim * dim ints
 * dst: output matrix, written sequentially; destination row j receives
 *      source column j
 *
 * Nothing is read or written when dim <= 0.
 */
void transpose(int dim, int *src, int *dst) {
    /* Loop-invariant values computed once up front. */
    const int groups = dim / 4;      /* complete 4-element groups per row */
    const int stride4 = dim * 4;     /* source step covering four rows */

    /* dst is filled front to back, so one running index is enough. */
    int dstIndex = 0;

    for (int j = 0; j < dim; j++) {
        /* Source column j starts at element j and advances by dim per row. */
        int srcIndex = j;

        /*
         * Unrolled part: four independent copies per iteration, with no
         * conditional code inside the body.
         */
        for (int g = 0; g < groups; g++) {
            dst[dstIndex + 0] = src[srcIndex];
            dst[dstIndex + 1] = src[srcIndex + dim];
            dst[dstIndex + 2] = src[srcIndex + dim * 2];
            dst[dstIndex + 3] = src[srcIndex + dim * 3];
            dstIndex += 4;
            srcIndex += stride4;
        }

        /*
         * Remainder: the unrolled loop handled groups * 4 elements of this
         * row, so continue from there (not from `groups`). For a 13x13
         * matrix the unrolled loop runs 3 times per row and this loop once.
         */
        for (int i = groups * 4; i < dim; i++) {
            dst[dstIndex] = src[srcIndex];
            dstIndex += 1;
            srcIndex += dim;
        }
    }
}

嵌套循环在C中展开

3 个答案: