我尝试使用带有平铺算法的共享内存来实现大矩阵乘法。当我执行此代码时,我得到了正确的结果。
但是当我尝试计算大于或等于4000 x 4000大小的矩阵时,它返回一个仅填充0的矩阵。
我找不出这段代码有什么问题……
下面的代码是矩阵乘法的核函数。
typedef float element;
#define TILE_WIDTH 32
#define WIDTH 4010
// Blocks per grid dimension: ceil(WIDTH / TILE_WIDTH), fully parenthesized so
// the macro is safe inside larger expressions. The previous form
// "((WIDTH)/TILE_WIDTH) + 1" left the "+ 1" outside the parentheses and
// launched a needless extra block row/column whenever WIDTH was an exact
// multiple of TILE_WIDTH. (For WIDTH = 4010 both forms evaluate to 126.)
#define GRID_WIDTH (((WIDTH) + (TILE_WIDTH) - 1) / (TILE_WIDTH))
// Tiled matrix multiply: d_P = d_N * d_M, all Width x Width, row-major.
// Expects a 2D launch with blockDim = (TILE_WIDTH, TILE_WIDTH) and at least
// ceil(Width / TILE_WIDTH) blocks in each grid dimension.
__global__ void MatrixMulKernel(element* d_P, element* d_N, element* d_M, int Width) {
// Per-block staging tiles, one TILE_WIDTH x TILE_WIDTH piece of each input.
__shared__ element ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ element ds_N[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int Row = by*TILE_WIDTH + ty;  // output row this thread computes
int Col = bx*TILE_WIDTH + tx;  // output column this thread computes
element pValue = 0;
// Number of tiles along the shared dimension (ceiling division).
int length = (Width + TILE_WIDTH - 1)/TILE_WIDTH;
for(int m = 0; m < length; m++) {
// BUG FIX: guard with the runtime Width parameter, not the compile-time WIDTH
// macro. The original mixed the two, silently reading out of bounds (or
// zero-padding wrongly) for any call where Width != WIDTH.
// Out-of-range lanes store 0 so the partial dot product is unaffected.
if( m*TILE_WIDTH + tx < Width && Row < Width) ds_N[ty][tx] = d_N[Row*Width + m*TILE_WIDTH+tx];
else ds_N[ty][tx] = 0.0f;
if( m*TILE_WIDTH + ty < Width && Col < Width) ds_M[ty][tx] = d_M[Col + (m*TILE_WIDTH+ty)*Width];
else ds_M[ty][tx] = 0.0f;
__syncthreads();  // all tile loads must finish before any lane reads them
for(int k = 0; k < TILE_WIDTH; ++k) {
pValue += ds_N[ty][k]*ds_M[k][tx];
}
__syncthreads();  // all reads must finish before the next tile overwrites
}
if( Row < Width && Col < Width)
d_P[Row*Width+Col] = pValue;
}
以下代码为完整代码。
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
using namespace std;
using namespace chrono;
typedef float element;
#define TILE_WIDTH 32
#define WIDTH 4010
// Blocks per grid dimension: ceil(WIDTH / TILE_WIDTH), fully parenthesized so
// the macro is safe inside larger expressions. The previous form
// "((WIDTH)/TILE_WIDTH) + 1" left the "+ 1" outside the parentheses and
// launched a needless extra block row/column whenever WIDTH was an exact
// multiple of TILE_WIDTH. (For WIDTH = 4010 both forms evaluate to 126.)
#define GRID_WIDTH (((WIDTH) + (TILE_WIDTH) - 1) / (TILE_WIDTH))
// Tiled matrix multiply: d_P = d_N * d_M, all Width x Width, row-major.
// Expects a 2D launch with blockDim = (TILE_WIDTH, TILE_WIDTH) and at least
// ceil(Width / TILE_WIDTH) blocks in each grid dimension.
// (Also repairs the paste error that split "Row" across two lines, which
// would not compile.)
__global__ void MatrixMulKernel(element* d_P, element* d_N, element* d_M, int Width) {
// Per-block staging tiles, one TILE_WIDTH x TILE_WIDTH piece of each input.
__shared__ element ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ element ds_N[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int Row = by*TILE_WIDTH + ty;  // output row this thread computes
int Col = bx*TILE_WIDTH + tx;  // output column this thread computes
element pValue = 0;
// Number of tiles along the shared dimension (ceiling division).
int length = (Width + TILE_WIDTH - 1)/TILE_WIDTH;
for(int m = 0; m < length; m++) {
// BUG FIX: guard with the runtime Width parameter, not the compile-time WIDTH
// macro. The original mixed the two, silently reading out of bounds (or
// zero-padding wrongly) for any call where Width != WIDTH.
// Out-of-range lanes store 0 so the partial dot product is unaffected.
if( m*TILE_WIDTH + tx < Width && Row < Width) ds_N[ty][tx] = d_N[Row*Width + m*TILE_WIDTH+tx];
else ds_N[ty][tx] = 0.0f;
if( m*TILE_WIDTH + ty < Width && Col < Width) ds_M[ty][tx] = d_M[Col + (m*TILE_WIDTH+ty)*Width];
else ds_M[ty][tx] = 0.0f;
__syncthreads();  // all tile loads must finish before any lane reads them
for(int k = 0; k < TILE_WIDTH; ++k) {
pValue += ds_N[ty][k]*ds_M[k][tx];
}
__syncthreads();  // all reads must finish before the next tile overwrites
}
if( Row < Width && Col < Width)
d_P[Row*Width+Col] = pValue;
}
// CPU reference implementation: c = a * b for square width x width,
// row-major matrices. c[y*width + x] = sum_k a[y*width + k] * b[k*width + x].
// Note: the `register` keyword was removed here — it is deprecated since
// C++11 and a hard error in C++17, which modern nvcc defaults to.
void matmul(float* c, float* a, float* b, int width) {
for (int y = 0; y < width; ++y) {
for (int x = 0; x < width; ++x) {
float sum = 0.0F;  // accumulate one output element
for (int k = 0; k < width; ++k) {
sum += a[y * width + k] * b[k * width + x];
}
c[y * width + x] = sum;
}
}
}
// Fill ptr[0 .. size-1] with pseudo-random integer values in [1, 10].
// Walks the buffer from the last element down to index 0, so the mapping of
// successive rand() results to indices matches the original while(size--)
// form exactly. Caller is expected to have seeded rand() via srand().
void genData(element* ptr, unsigned int size) {
for (unsigned int i = size; i > 0; --i) {
ptr[i - 1] = (rand() % 10) + 1;
}
}
// Abort with a readable message when a CUDA API call fails. Without this,
// a kernel that faults or is killed by the OS display watchdog (a plausible
// cause of the all-zero result at >= 4000x4000: the kernel simply runs long
// enough to be terminated — TODO confirm with the reported error code)
// leaves d_P untouched and the program silently prints zeros.
static void cudaCheck(cudaError_t err, const char* what) {
if (err != cudaSuccess) {
fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
int main() {
element *pA = NULL;
element *pB = NULL;
element *pC = NULL;
system_clock::time_point start;
system_clock::time_point end;
srand(time(NULL));
start = system_clock::now();
// Allocate host-side buffers and fail loudly on exhaustion
// (WIDTH*WIDTH floats is ~64 MB per matrix at WIDTH = 4010).
pA = (element*)malloc(WIDTH * WIDTH * sizeof(element));
pB = (element*)malloc(WIDTH * WIDTH * sizeof(element));
pC = (element*)malloc(WIDTH * WIDTH * sizeof(element));
if (pA == NULL || pB == NULL || pC == NULL) {
fprintf(stderr, "host allocation failed\n");
return EXIT_FAILURE;
}
genData(pA, WIDTH*WIDTH);
genData(pB, WIDTH*WIDTH);
// Device-side buffers; every CUDA call is now checked.
element *pAdev = NULL;
element *pBdev = NULL;
element *pCdev = NULL;
cudaCheck(cudaMalloc((void**)&pAdev, WIDTH*WIDTH*sizeof(element)), "cudaMalloc A");
cudaCheck(cudaMalloc((void**)&pBdev, WIDTH*WIDTH*sizeof(element)), "cudaMalloc B");
cudaCheck(cudaMalloc((void**)&pCdev, WIDTH*WIDTH*sizeof(element)), "cudaMalloc C");
cudaCheck(cudaMemcpy(pAdev, pA, WIDTH * WIDTH * sizeof(element), cudaMemcpyHostToDevice), "copy A");
cudaCheck(cudaMemcpy(pBdev, pB, WIDTH * WIDTH * sizeof(element), cudaMemcpyHostToDevice), "copy B");
dim3 dimGrid(GRID_WIDTH, GRID_WIDTH, 1);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
MatrixMulKernel<<<dimGrid, dimBlock>>>(pCdev, pAdev, pBdev, WIDTH);
// Launch-configuration errors surface via cudaGetLastError(); asynchronous
// execution errors (timeout, illegal address) surface at the sync.
cudaCheck(cudaGetLastError(), "kernel launch");
cudaCheck(cudaDeviceSynchronize(), "kernel execution");
cudaCheck(cudaMemcpy(pC, pCdev, WIDTH * WIDTH * sizeof(element), cudaMemcpyDeviceToHost), "copy result");
// CPU reference for verification.
element *test;
test = (element*)malloc(WIDTH * WIDTH * sizeof(element));
if (test == NULL) {
fprintf(stderr, "host allocation failed\n");
return EXIT_FAILURE;
}
matmul(test, pA, pB, WIDTH);
// Exact float comparison is valid here: every entry is an integer-valued
// sum bounded by WIDTH*10*10 < 2^24, so both CPU and GPU compute it exactly.
bool flag = true;
for(int i = 0; i<WIDTH; i++) {
for(int j = 0; j<WIDTH; j++) {
if(pC[i*WIDTH+j] != test[i*WIDTH+j]) {
printf("%f, %f\n", pC[i*WIDTH+j], test[i*WIDTH+j]);
flag = false;
}
}
}
if(flag) printf("TRUE\n");
else printf("FALSE\n");
end = system_clock::now();
cout << "Elapsed time : " << duration_cast<nanoseconds>(end - start).count() << "ns" << '\n';
free(pA);
free(pB);
free(pC);
free(test);  // was leaked in the original
cudaFree(pAdev);
cudaFree(pBdev);
cudaFree(pCdev);
return 0;
}
我在等你的回答!谢谢!