我正在尝试通过改进缓存利用率来优化单个处理器上的矩阵乘法。我实现了分块乘法并做了一些循环展开,但不知道还能如何进一步优化,而基准测试结果显示它显然仍不够理想。
矩阵按列主序(column-major)存储。
任何建议都将不胜感激!
/*
 * Edge length, in elements, of the square tiles that square_dgemm copies
 * into its local buffers. The inner product loop in square_dgemm advances
 * k four at a time, so this value should stay a multiple of 4.
 */
const int blockSize = 16;
/* Return the smaller of the two arguments (b when they compare equal). */
int min(int a, int b) {
    return (a < b) ? a : b;
}
/*
 * Copy a rectangular region of inputMatrix into a dense, zero-padded
 * blockSize x blockSize tile.
 *
 * For beginX <= x < endX and beginY <= y < endY the element
 * inputMatrix[x * n + y] is stored at
 * outMatrix[(x - beginX) * blockSize + (y - beginY)].
 * Every tile slot not covered by the region is set to 0, so outMatrix must
 * have room for blockSize * blockSize doubles.
 *
 * The region is expected to fit inside the tile (endX - beginX and
 * endY - beginY no larger than blockSize); this is not checked.
 */
void readInMatrix(double* inputMatrix, double* outMatrix, int beginX, int beginY, int endX, int endY, int blockSize, int n) {
    const int padEndX = beginX + blockSize;
    const int padEndY = beginY + blockSize;
    int slot = 0; /* next tile position to fill */

    for (int x = beginX; x < endX; ++x) {
        /* Real data for this line of the tile. */
        for (int y = beginY; y < endY; ++y) {
            outMatrix[slot++] = inputMatrix[x * n + y];
        }
        /* Zero-fill the rest of the line when the region is short in Y. */
        for (int y = endY; y < padEndY; ++y) {
            outMatrix[slot++] = 0;
        }
    }

    /* Zero-fill whole lines when the region is short in X. */
    for (int x = endX; x < padEndX; ++x) {
        for (int y = 0; y < blockSize; ++y) {
            outMatrix[slot++] = 0;
        }
    }
}
/*
 * Copy the used part of a blockSize x blockSize tile back into a larger
 * matrix.
 *
 * For beginX <= x < endX and beginY <= y < endY the tile element
 * inputMatrix[(x - beginX) * blockSize + (y - beginY)] is stored at
 * outMatrix[n * x + y]. Tile slots outside that region are not read and
 * no other element of outMatrix is touched.
 */
void writeMatrix(double* inputMatrix, double* outMatrix, int beginX, int beginY, int endX, int endY, int blockSize, int n) {
    for (int x = beginX; x < endX; ++x) {
        /* Each tile line is blockSize long even if only part of it is used. */
        const int tileBase = (x - beginX) * blockSize;
        for (int y = beginY; y < endY; ++y) {
            outMatrix[n * x + y] = inputMatrix[tileBase + (y - beginY)];
        }
    }
}
/*
 * Accumulate a matrix product into C:  C := C + A * B.
 *
 * A, B and C are n x n matrices addressed as M[row + col * n].
 *
 * For n <= blockSize a plain triple loop is used. For larger n the matrices
 * are processed in blockSize x blockSize tiles: each tile is copied into a
 * small zero-padded local buffer with readInMatrix, multiplied there, and
 * the finished C tile is copied back with writeMatrix. Zero padding lets the
 * tile kernel always run over full blockSize ranges, even at the matrix
 * edges, without affecting the result.
 *
 * Parameters:
 *   n  matrix dimension
 *   A  left operand, n * n doubles (only read)
 *   B  right operand, n * n doubles (only read)
 *   C  accumulator, n * n doubles (read and updated in place)
 */
void square_dgemm(int n, double* A, double* B, double* C) {
    if (n <= blockSize) {
        /* Small problem: straightforward dot product per element of C. */
        for (int i = 0; i < n; ++i) {
            /* For each column j of B */
            for (int j = 0; j < n; ++j) {
                /* Compute C(i,j) */
                double cij = C[i + j * n];
                for (int k = 0; k < n; k++) {
                    cij += A[i + k * n] * B[k + j * n];
                }
                C[i + j * n] = cij;
            }
        }
        return;
    }

    /* Local copies of the current tiles of A, B and C. */
    double ac[blockSize*blockSize];
    double bc[blockSize*blockSize];
    double cc[blockSize*blockSize];

    /* x selects a block of columns of C (and B), y a block of rows of C (and A). */
    for (int x = 0; x < n; x += blockSize) {
        const int xEnd = min(x + blockSize, n);
        for (int y = 0; y < n; y += blockSize) {
            const int yEnd = min(y + blockSize, n);

            readInMatrix(C, cc, x, y, xEnd, yEnd, blockSize, n);

            /* z walks the shared dimension of A and B. */
            for (int z = 0; z < n; z += blockSize) {
                const int zEnd = min(z + blockSize, n);

                readInMatrix(A, ac, z, y, zEnd, yEnd, blockSize, n);
                readInMatrix(B, bc, x, z, xEnd, zEnd, blockSize, n);

                /* i indexes within the x block, j within the y block. */
                for (int i = 0; i < blockSize; i++) {
                    const double *bLine = bc + i * blockSize;
                    for (int j = 0; j < blockSize; j++) {
                        double cij = cc[i * blockSize + j];
                        /*
                         * Inner product unrolled by four; relies on
                         * blockSize being a multiple of 4.
                         */
                        for (int k = 0; k < blockSize; k += 4) {
                            cij += ac[j + blockSize * k]       * bLine[k]
                                 + ac[j + blockSize * (k + 1)] * bLine[k + 1]
                                 + ac[j + blockSize * (k + 2)] * bLine[k + 2]
                                 + ac[j + blockSize * (k + 3)] * bLine[k + 3];
                        }
                        cc[i * blockSize + j] = cij;
                    }
                }
            }

            /* Only the part of the tile that lies inside C is written back. */
            writeMatrix(cc, C, x, y, xEnd, yEnd, blockSize, n);
        }
    }
}
结果:
大小:31   Mflop/s:5612.97  百分比:13.08
大小:32   Mflop/s:6313.5   百分比:14.72
大小:96   Mflop/s:6666.09  百分比:15.54
大小:97   Mflop/s:4480.09  百分比:10.44
大小:127  Mflop/s:6520.47  百分比:15.20
大小:128  Mflop/s:6674.19  百分比:15.56
大小:129  Mflop/s:4731.75  百分比:11.03
大小:191  Mflop/s:6470.42  百分比:15.08
大小:192  Mflop/s:6524.03  百分比:15.21
大小:229  Mflop/s:5791.24  百分比:13.50
大小:255  Mflop/s:6469.74  百分比:15.08
大小:256  Mflop/s:6216.15  百分比:14.49
大小:257  Mflop/s:5616.37  百分比:13.09
大小:319  Mflop/s:6522.59  百分比:15.20
大小:320  Mflop/s:6281.06  百分比:14.64
大小:321  Mflop/s:5796.06  百分比:13.51
大小:417  Mflop/s:5982.09  百分比:13.94
大小:479  Mflop/s:6422.19  百分比:14.97
大小:480  Mflop/s:6502.51  百分比:15.16
大小:511  Mflop/s:6407.95  百分比:14.94
大小:512  Mflop/s:6407.58  百分比:14.94
大小:639  Mflop/s:6460.55  百分比:15.06
大小:640  Mflop/s:6173.96  百分比:14.39
大小:767  Mflop/s:6346.06  百分比:14.79
大小:768  Mflop/s:6490.66  百分比:15.13
大小:769  Mflop/s:6039.4   百分比:14.08
峰值的平均百分比 = 14.3374