我尝试使用带有平铺算法的共享内存来实现大矩阵乘法。当我执行此代码时,我得到了正确的结果。
但是当我尝试计算大于或等于4000 x 4000大小的矩阵时,它返回一个仅填充0的矩阵。
我找不出这段代码有什么问题……
下面的代码是矩阵乘法的核函数。
typedef float element;
#define TILE_WIDTH 32
#define WIDTH 4010
// Blocks per grid dimension: ceil(WIDTH / TILE_WIDTH), fully parenthesized so
// the macro is safe inside larger expressions. The previous form
// "((WIDTH)/TILE_WIDTH) + 1" left the "+ 1" outside the parentheses and
// launched a needless extra block row/column whenever WIDTH was an exact
// multiple of TILE_WIDTH. (For WIDTH = 4010 both forms evaluate to 126.)
#define GRID_WIDTH (((WIDTH) + (TILE_WIDTH) - 1) / (TILE_WIDTH))
// Tiled matrix multiply: d_P = d_N * d_M, all Width x Width, row-major.
// Expects a 2D launch with blockDim = (TILE_WIDTH, TILE_WIDTH) and at least
// ceil(Width / TILE_WIDTH) blocks in each grid dimension.
__global__ void MatrixMulKernel(element* d_P, element* d_N, element* d_M, int Width) {
// Per-block staging tiles, one TILE_WIDTH x TILE_WIDTH piece of each input.
__shared__ element ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ element ds_N[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int Row = by*TILE_WIDTH + ty;  // output row this thread computes
int Col = bx*TILE_WIDTH + tx;  // output column this thread computes
element pValue = 0;
// Number of tiles along the shared dimension (ceiling division).
int length = (Width + TILE_WIDTH - 1)/TILE_WIDTH;
for(int m = 0; m < length; m++) {
// BUG FIX: guard with the runtime Width parameter, not the compile-time WIDTH
// macro. The original mixed the two, silently reading out of bounds (or
// zero-padding wrongly) for any call where Width != WIDTH.
// Out-of-range lanes store 0 so the partial dot product is unaffected.
if( m*TILE_WIDTH + tx < Width && Row < Width) ds_N[ty][tx] = d_N[Row*Width + m*TILE_WIDTH+tx];
else ds_N[ty][tx] = 0.0f;
if( m*TILE_WIDTH + ty < Width && Col < Width) ds_M[ty][tx] = d_M[Col + (m*TILE_WIDTH+ty)*Width];
else ds_M[ty][tx] = 0.0f;
__syncthreads();  // all tile loads must finish before any lane reads them
for(int k = 0; k < TILE_WIDTH; ++k) {
pValue += ds_N[ty][k]*ds_M[k][tx];
}
__syncthreads();  // all reads must finish before the next tile overwrites
}
if( Row < Width && Col < Width)
d_P[Row*Width+Col] = pValue;
}
以下代码为完整代码。
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
using namespace std;
using namespace chrono;
typedef float element;
#define TILE_WIDTH 32
#define WIDTH 4010
// Blocks per grid dimension: ceil(WIDTH / TILE_WIDTH), fully parenthesized so
// the macro is safe inside larger expressions. The previous form
// "((WIDTH)/TILE_WIDTH) + 1" left the "+ 1" outside the parentheses and
// launched a needless extra block row/column whenever WIDTH was an exact
// multiple of TILE_WIDTH. (For WIDTH = 4010 both forms evaluate to 126.)
#define GRID_WIDTH (((WIDTH) + (TILE_WIDTH) - 1) / (TILE_WIDTH))
// Tiled matrix multiply: d_P = d_N * d_M, all Width x Width, row-major.
// Expects a 2D launch with blockDim = (TILE_WIDTH, TILE_WIDTH) and at least
// ceil(Width / TILE_WIDTH) blocks in each grid dimension.
// (Also repairs the paste error that split "Row" across two lines, which
// would not compile.)
__global__ void MatrixMulKernel(element* d_P, element* d_N, element* d_M, int Width) {
// Per-block staging tiles, one TILE_WIDTH x TILE_WIDTH piece of each input.
__shared__ element ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ element ds_N[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int Row = by*TILE_WIDTH + ty;  // output row this thread computes
int Col = bx*TILE_WIDTH + tx;  // output column this thread computes
element pValue = 0;
// Number of tiles along the shared dimension (ceiling division).
int length = (Width + TILE_WIDTH - 1)/TILE_WIDTH;
for(int m = 0; m < length; m++) {
// BUG FIX: guard with the runtime Width parameter, not the compile-time WIDTH
// macro. The original mixed the two, silently reading out of bounds (or
// zero-padding wrongly) for any call where Width != WIDTH.
// Out-of-range lanes store 0 so the partial dot product is unaffected.
if( m*TILE_WIDTH + tx < Width && Row < Width) ds_N[ty][tx] = d_N[Row*Width + m*TILE_WIDTH+tx];
else ds_N[ty][tx] = 0.0f;
if( m*TILE_WIDTH + ty < Width && Col < Width) ds_M[ty][tx] = d_M[Col + (m*TILE_WIDTH+ty)*Width];
else ds_M[ty][tx] = 0.0f;
__syncthreads();  // all tile loads must finish before any lane reads them
for(int k = 0; k < TILE_WIDTH; ++k) {
pValue += ds_N[ty][k]*ds_M[k][tx];
}
__syncthreads();  // all reads must finish before the next tile overwrites
}
if( Row < Width && Col < Width)
d_P[Row*Width+Col] = pValue;
}
// CPU reference implementation: c = a * b for square width x width,
// row-major matrices. c[y*width + x] = sum_k a[y*width + k] * b[k*width + x].
// Note: the `register` keyword was removed here — it is deprecated since
// C++11 and a hard error in C++17, which modern nvcc defaults to.
void matmul(float* c, float* a, float* b, int width) {
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; ++x) {
float sum = 0.0F;  // accumulate one output element
for (int k = 0; k < width; ++k) {
sum += a[y * width + k] * b[k * width + x];
}
c[y * width + x] = sum;
}
}
}
// Fill ptr[0 .. size-1] with pseudo-random integer values in [1, 10].
// Walks the buffer from the last element down to index 0, so the mapping of
// successive rand() results to indices matches the original while(size--)
// form exactly. Caller is expected to have seeded rand() via srand().
void genData(element* ptr, unsigned int size) {
for (unsigned int i = size; i > 0; --i) {
ptr[i - 1] = (rand() % 10) + 1;
}
}
// Abort with a readable message when a CUDA API call fails. Without this,
// a kernel that faults or is killed by the OS display watchdog (a plausible
// cause of the all-zero result at >= 4000x4000: the kernel simply runs long
// enough to be terminated — TODO confirm with the reported error code)
// leaves d_P untouched and the program silently prints zeros.
static void cudaCheck(cudaError_t err, const char* what) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main() {
element *pA = NULL;
element *pB = NULL;
element *pC = NULL;
system_clock::time_point start;
system_clock::time_point end;
srand(time(NULL));
start = system_clock::now();
// Allocate host-side buffers and fail loudly on exhaustion
// (WIDTH*WIDTH floats is ~64 MB per matrix at WIDTH = 4010).
pA = (element*)malloc(WIDTH * WIDTH * sizeof(element));
pB = (element*)malloc(WIDTH * WIDTH * sizeof(element));
pC = (element*)malloc(WIDTH * WIDTH * sizeof(element));
if (pA == NULL || pB == NULL || pC == NULL) {
fprintf(stderr, "host allocation failed\n");
return EXIT_FAILURE;
}
genData(pA, WIDTH*WIDTH);
genData(pB, WIDTH*WIDTH);
// Device-side buffers; every CUDA call is now checked.
element *pAdev = NULL;
element *pBdev = NULL;
element *pCdev = NULL;
cudaCheck(cudaMalloc((void**)&pAdev, WIDTH*WIDTH*sizeof(element)), "cudaMalloc A");
cudaCheck(cudaMalloc((void**)&pBdev, WIDTH*WIDTH*sizeof(element)), "cudaMalloc B");
cudaCheck(cudaMalloc((void**)&pCdev, WIDTH*WIDTH*sizeof(element)), "cudaMalloc C");
cudaCheck(cudaMemcpy(pAdev, pA, WIDTH * WIDTH * sizeof(element), cudaMemcpyHostToDevice), "copy A");
cudaCheck(cudaMemcpy(pBdev, pB, WIDTH * WIDTH * sizeof(element), cudaMemcpyHostToDevice), "copy B");
dim3 dimGrid(GRID_WIDTH, GRID_WIDTH, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(pCdev, pAdev, pBdev, WIDTH);
// Launch-configuration errors surface via cudaGetLastError(); asynchronous
// execution errors (timeout, illegal address) surface at the sync.
cudaCheck(cudaGetLastError(), "kernel launch");
cudaCheck(cudaDeviceSynchronize(), "kernel execution");
cudaCheck(cudaMemcpy(pC, pCdev, WIDTH * WIDTH * sizeof(element), cudaMemcpyDeviceToHost), "copy result");
// CPU reference for verification.
element *test;
test = (element*)malloc(WIDTH * WIDTH * sizeof(element));
if (test == NULL) {
fprintf(stderr, "host allocation failed\n");
return EXIT_FAILURE;
}
matmul(test, pA, pB, WIDTH);
// Exact float comparison is valid here: every entry is an integer-valued
// sum bounded by WIDTH*10*10 < 2^24, so both CPU and GPU compute it exactly.
bool flag = true;
for(int i = 0; i<WIDTH; i++) {
for(int j = 0; j<WIDTH; j++) {
if(pC[i*WIDTH+j] != test[i*WIDTH+j]) {
printf("%f, %f\n", pC[i*WIDTH+j], test[i*WIDTH+j]);
flag = false;
}
}
}
if(flag) printf("TRUE\n");
else printf("FALSE\n");
end = system_clock::now();
cout << "Elapsed time : " << duration_cast<nanoseconds>(end - start).count() << "ns" << '\n';
free(pA);
free(pB);
free(pC);
free(test);  // was leaked in the original
cudaFree(pAdev);
cudaFree(pBdev);
cudaFree(pCdev);
return 0;
}
我在等你的回答!谢谢!