如何进一步缓存优化矩阵乘法算法?

时间:2019-02-16 00:57:16

标签: c matrix-multiplication

我正在尝试通过优化缓存使用来优化单个处理器上的矩阵乘法。我实现了分块乘法并做了一些循环展开,但不知道该如何进一步优化——从基准测试结果来看,它显然还远不够理想。

矩阵按列主序(column-major)存储。

任何建议将不胜感激!

/* Edge length, in elements, of the square tiles that square_dgemm copies
 * into its local buffers.  The inner product in square_dgemm advances k in
 * steps of 4, so this value must remain a multiple of 4. */
const int blockSize = 16;

/* Return the smaller of the two arguments; b is returned when they are equal. */
int min(int a, int b) {
        return (a < b) ? a : b;
}

/*
 * Copy one tile of a matrix into a dense blockSize x blockSize buffer and
 * zero-fill every buffer entry that lies outside the copied range.
 *
 * Source element (i, j) is read from inputMatrix[i * n + j]; it is stored at
 * outMatrix[(i - beginX) * blockSize + (j - beginY)].
 *
 *   inputMatrix   source matrix; consecutive i values are n elements apart
 *   outMatrix     destination tile, blockSize * blockSize doubles, all of
 *                 which are written
 *   beginX, endX  half-open range of i to copy
 *   beginY, endY  half-open range of j to copy
 *   blockSize     edge length of the destination tile
 *   n             stride of the source matrix
 *
 * No bounds checking is performed: the ranges are expected to be at most
 * blockSize long, otherwise writes run past the end of the tile.
 */
void readInMatrix(double* inputMatrix, double* outMatrix, int beginX, int beginY, int endX, int endY, int blockSize, int n) {

        for (int i = beginX; i < endX; i++) {

                for (int j = beginY; j < endY; j++) {
                        *outMatrix++ = inputMatrix[i * n + j];
                }

                /* Pad the remainder of this tile line (j beyond endY). */
                for (int pad = endY; pad < beginY + blockSize; pad++) {
                        *outMatrix++ = 0;
                }
        }

        /* Pad the tile lines that have no source data at all (i beyond endX). */
        for (int line = endX; line < beginX + blockSize; line++) {
                for (int m = 0; m < blockSize; m++) {
                        *outMatrix++ = 0;
                }
        }
}



/*
 * Copy the valid part of a dense blockSize x blockSize tile back into a
 * matrix (the inverse of the copy done by readInMatrix; padding entries of
 * the tile are skipped).
 *
 * Tile entry inputMatrix[(i - beginX) * blockSize + (j - beginY)] is stored
 * to outMatrix[n * i + j].
 *
 *   inputMatrix   source tile whose lines are blockSize elements apart
 *   outMatrix     destination matrix; consecutive i values are n apart
 *   beginX, endX  half-open range of i to write
 *   beginY, endY  half-open range of j to write
 *   blockSize     edge length of the source tile
 *   n             stride of the destination matrix
 */
void writeMatrix(double* inputMatrix, double* outMatrix, int beginX, int beginY, int endX, int endY, int blockSize, int n) {

        for (int i = beginX; i < endX; i++) {

                /* Tile lines are always blockSize apart, even when fewer than
                 * blockSize entries of a line are written back. */
                const double* tileLine = inputMatrix + (i - beginX) * blockSize;

                for (int j = beginY; j < endY; j++) {
                        outMatrix[n * i + j] = tileLine[j - beginY];
                }
        }
}

/*
 * Compute C += A * B for n x n matrices of doubles in which element
 * (row r, column c) is stored at index r + c * n.
 *
 * For n > blockSize the product is computed tile by tile: a tile of C is
 * copied into a local buffer, every matching pair of A and B tiles is copied
 * in and accumulated into it, and the result is written back.  Tiles at the
 * matrix edge are zero-padded by readInMatrix, so the kernel always works on
 * full blockSize x blockSize tiles.  For n <= blockSize a plain triple loop
 * is used instead.
 *
 *   n        matrix dimension
 *   A, B     input matrices (read only)
 *   C        accumulator matrix, updated in place
 */
void square_dgemm(int n, double* A, double* B, double* C) {

        if (n > blockSize) {

                /* Tile buffers; these are variable-length arrays because
                 * blockSize is a const int rather than a constant expression. */
                double ac[blockSize*blockSize];
                double bc[blockSize*blockSize];
                double cc[blockSize*blockSize];

                /* x walks tile columns of C, y walks tile rows of C. */
                for (int x = 0; x < n; x += blockSize) {
                        const int xEnd = min(x + blockSize, n);

                        for (int y = 0; y < n; y += blockSize) {
                                const int yEnd = min(y + blockSize, n);

                                readInMatrix(C, cc, x, y, xEnd, yEnd, blockSize, n);

                                /* z walks the shared dimension: columns of A, rows of B. */
                                for (int z = 0; z < n; z += blockSize) {
                                        const int zEnd = min(z + blockSize, n);

                                        readInMatrix(A, ac, z, y, zEnd, yEnd, blockSize, n);
                                        readInMatrix(B, bc, x, z, xEnd, zEnd, blockSize, n);

                                        /* i: column within the tile, j: row within the tile. */
                                        for (int i = 0; i < blockSize; i++) {
                                                for (int j = 0; j < blockSize; j++) {
                                                        double cij = *(cc + i * blockSize + j);

                                                        /* Inner product unrolled by 4; relies on
                                                         * blockSize being a multiple of 4. */
                                                        for (int k = 0; k < blockSize; k += 4) {
                                                                cij += *(ac + j + blockSize * k)       * *(bc + i * blockSize + k)
                                                                     + *(ac + j + blockSize * (k + 1)) * *(bc + i * blockSize + k + 1)
                                                                     + *(ac + j + blockSize * (k + 2)) * *(bc + i * blockSize + k + 2)
                                                                     + *(ac + j + blockSize * (k + 3)) * *(bc + i * blockSize + k + 3);
                                                        }

                                                        *(cc + i * blockSize + j) = cij;
                                                }
                                        }
                                }

                                writeMatrix(cc, C, x, y, xEnd, yEnd, blockSize, n);
                        }
                }
        }
        else {

                /* For each row i of A */
                for (int i = 0; i < n; ++i) {
                        /* For each column j of B */
                        for (int j = 0; j < n; ++j) {
                                /* Compute C(i,j) */
                                double cij = C[i + j * n];
                                for (int k = 0; k < n; k++) {
                                        cij += A[i + k * n] * B[k + j * n];
                                }
                                C[i + j * n] = cij;
                        }
                }
        }
}

结果: 大小:31 Mflop / s:5612.97百分比:13.08 大小:32 Mflop / s:6313.5百分比:14.72 大小:96 Mflop / s:6666.09百分比:15.54 大小:97 Mflop / s:4480.09百分比:10.44 大小:127 Mflop / s:6520.47百分比:15.20 大小:128 Mflop / s:6674.19百分比:15.56 大小:129 Mflop / s:4731.75百分比:11.03 大小:191 Mflop / s:6470.42百分比:15.08 大小:192 Mflop / s:6524.03百分比:15.21 大小:229 Mflop / s:5791.24百分比:13.50 大小:255 Mflop / s:6469.74百分比:15.08 大小:256 Mflop / s:6216.15百分比:14.49 大小:257 Mflop / s:5616.37百分比:13.09 大小:319 Mflop / s:6522.59百分比:15.20 大小:320 Mflop / s:6281.06百分比:14.64 大小:321 Mflop / s:5796.06百分比:13.51 大小:417 Mflop / s:5982.09百分比:13.94 大小:479 Mflop / s:6422.19百分比:14.97 大小:480 Mflop / s:6502.51百分比:15.16 大小:511 Mflop / s:6407.95百分比:14.94 大小:512 Mflop / s:6407.58百分比:14.94 大小:639 Mflop / s:6460.55百分比:15.06 大小:640 Mflop / s:6173.96百分比:14.39 大小:767 Mflop / s:6346.06百分比:14.79 大小:768 Mflop / s:6490.66百分比:15.13 大小:769 Mflop / s:6039.4百分比:14.08 峰值的平均百分比= 14.3374

0 个答案:

没有答案