Question

我的程序创建3个矩阵A，B和C。然后将A点B的点积存储在C中。我试图比较在CPU和GPU上执行此操作所需的时间。我编写了两个函数CPUDot和GPUDot。 CPUDot计算该函数中的点积，GPUDot调用内核函数。

该程序在 size 值相对较小时可以正常工作，但一旦 size 达到 650-675 左右就不行了。内核仍然会执行，所花费的时间也仍然会打印出来，但程序会在执行 subtract(C, D, diff); 时崩溃。

执行此操作时，我添加了一条printf语句，用于打印索引以查看其停止工作的地方，并且无法访问其中一个数组的第一个索引。

所有矩阵的大小均为m x n。

我不确定这是否是由于我特定的GPU的内存大小所致。我使用的是GTX 965m，我很确定它具有1024个CUDA内核和2gb VRam。

#include <stdio.h>

#define MAX_THREADS 1024
#define MAX_BLOCKS 65535

// Wraps a CUDA runtime call and reports any failure together with the file
// and line of the call site. The do/while(0) wrapper makes the macro expand
// to exactly one statement, so `if (c) gpuErrchk(x); else ...` parses as
// intended instead of leaving a stray `;` in front of the `else`.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints a diagnostic to stderr when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default), terminate the process using `code` as
//           the exit status; when false, only report and return
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code == cudaSuccess) return;

   fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
   if (abort) exit(code);
}

// Computes the matrix product C = A * B for row-major matrices.
//   A - m x h input, element [row][k] at A[row * h + k]
//   B - h x n input, element [k][col] at B[k * n + col]
//   C - m x n output, element [row][col] at C[row * n + col]
// Launch layout: 1D grid of 1D blocks. Each thread starts at its flat global
// index and advances by the total number of launched threads, so any launch
// configuration (including <<<1, 1>>>) covers every element of C.
// No shared memory is used. A and B are only read; C is only written.
__global__ void Dot(double *A, double *B, double *C, int m, int n, int h){

    // Nothing to write for an empty result.
    if(m <= 0 || n <= 0){
        return;
    }

    // All flat indices are kept in 64 bits: m * n, row * h and k * n can each
    // exceed the range of a 32-bit int for large matrices.
    const long long total  = (long long)m * n;
    const long long stride = (long long)gridDim.x * blockDim.x;
    const long long first  = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    for(long long i = first; i < total; i += stride){
        // Treat C as two dimensional: i -> [row][col]
        const long long col = i % n;
        const long long row = i / n;

        const double *aRow = A + row * h;   // start of row `row` of A
        const double *bCol = B + col;       // top of column `col` of B

        double acc = 0;
        for(int k = 0; k < h; ++k){
            // Step down column `col` of B one row (n elements) at a time.
            acc += aRow[k] * bCol[(long long)k * n];
        }
        C[i] = acc;
    }
}

// Dense matrix of doubles stored row after row in a single buffer obtained
// from cudaMallocManaged, so the same pointer is usable from host code and
// from kernels. The object owns the buffer and frees it in its destructor.
class Matrix{
 public:
    int width;          // number of columns (n)
    int height;         // number of rows (m)
    double *elements;   // height * width values; [y][x] lives at elements[y * width + x]

    // Allocates an m x n matrix and sets the element at flat index i to i.
    // Any CUDA allocation failure is reported through gpuErrchk.
    Matrix(int m, int n){
        this->width = n;
        this->height = m;
        // Widen before multiplying: m * n evaluated in int can overflow
        // before it is ever converted to a byte count.
        const size_t count = (size_t)m * (size_t)n;
        gpuErrchk(cudaMallocManaged(&this->elements, count * sizeof(double)));
        for(size_t i = 0; i < count; ++i){
            this->elements[i] = (double)i;
        }
    }

    // Releases the managed buffer.
    ~Matrix(){
        gpuErrchk(cudaFree(this->elements));
    }

    // Overwrites every element with `value`.
    void populate(double value){
        const size_t count = (size_t)this->width * (size_t)this->height;
        for(size_t i = 0; i < count; ++i){
            this->elements[i] = value;
        }
    }

    // Prints the matrix to stdout, one row per line, values separated by spaces.
    void print(){
        size_t i = 0;
        for(int y = 0; y < this->height; ++y){
            for(int x = 0; x < this->width; ++x){
                printf("%lf ", this->elements[i]);
                ++i;
            }
            printf("\n");
        }
    }

 private:
    // Copying is disabled: a member-wise copy would share `elements` and the
    // buffer would be passed to cudaFree twice. Declared but never defined.
    Matrix(const Matrix &);
    Matrix &operator=(const Matrix &);
};

// Computes C = A * B on the GPU by launching the Dot kernel and waiting for
// it to finish. Launch and execution errors are reported through gpuErrchk.
//   A - left operand  (A.height x A.width)
//   B - right operand (expected A.width x B.width)
//   C - result        (expected A.height x B.width)
// NOTE(review): the dimensions of B and C are not validated here; the kernel
// indexes them using A.height, B.width and A.width only.
void GPUDot(Matrix &A, Matrix &B, Matrix &C){
    const int m = A.height;   // rows of the result
    const int n = B.width;    // columns of the result
    const int h = A.width;    // shared inner dimension

    // An empty result needs no launch (and a zero-sized grid is not a valid
    // launch configuration).
    if(m <= 0 || n <= 0){
        return;
    }

    // One thread per output element, rounded up to whole blocks and capped at
    // MAX_BLOCKS; the kernel's strided loop picks up any elements beyond the
    // grid. The previous <<<1, 1>>> launch ran the entire m*n*h computation
    // on one GPU thread, and a kernel that runs that long can be killed by
    // the display driver's watchdog and surface as "unspecified launch
    // failure".
    const int threads = 256;
    long long blocks = ((long long)m * n + threads - 1) / threads;
    if(blocks > MAX_BLOCKS){
        blocks = MAX_BLOCKS;
    }

    Dot<<<(unsigned int)blocks, threads>>>(A.elements, B.elements, C.elements, m, n, h);
    gpuErrchk( cudaPeekAtLastError() );     // launch-configuration errors
    gpuErrchk( cudaDeviceSynchronize() );   // errors raised while the kernel ran
}

// Computes C = A * B on the host.
//   A - left operand; element [y][x] at A.elements[y * A.width + x]
//   B - right operand; element [y][x] at B.elements[y * B.width + x]
//   C - result; every element [y][x] for y < C.height, x < C.width is overwritten
// The inner dimension is taken from A.width.
void CPUDot(const Matrix &A, const Matrix &B, Matrix &C){
    const int inner = A.width;
    for(int row = 0; row < C.height; ++row){
        // A row of A is A.width elements long, so that is the row stride
        // (using A.height here is only correct when A is square).
        const double *aRow = A.elements + (size_t)row * A.width;
        for(int col = 0; col < C.width; ++col){
            double acc = 0;
            for(int k = 0; k < inner; ++k){
                // Walk along row `row` of A and down column `col` of B.
                acc += aRow[k] * B.elements[(size_t)k * B.width + col];
            }
            C.elements[(size_t)row * C.width + col] = acc;
        }
    }
}

// Element-wise difference: C[i] = A[i] - B[i] for every flat index i in
// [0, A.height * A.width). The element count is taken from A alone.
void subtract(const Matrix &A, const Matrix &B, Matrix &C){
    const double *lhs = A.elements;
    const double *rhs = B.elements;
    double *out = C.elements;

    int remaining = A.height * A.width;
    while(remaining > 0){
        *out = *lhs - *rhs;
        ++out;
        ++lhs;
        ++rhs;
        --remaining;
    }
}

// Returns the sum of all A.height * A.width elements of A, accumulated in
// ascending flat-index order starting from 0.
double sum(const Matrix &A){
    const int count = A.height * A.width;
    double total = 0;
    int k = 0;
    while(k < count){
        total += A.elements[k];
        ++k;
    }
    return total;
}

// Multiplies two size x size matrices on the CPU and on the GPU, prints how
// long each took, and prints the summed element-wise difference between the
// two results.
int main(){

    int size = 100;
    printf("Dotting two matricies of size: %d\n", size);
    Matrix A(size, size);
    Matrix B(size, size);
    Matrix C(size, size);      // CPU result
    Matrix D(size, size);      // GPU result
    Matrix diff(size, size);   // C - D
    A.populate(1);
    B.populate(2);
    C.populate(0);

    //A.print();
    //B.print();

    // clock() returns clock_t, so the tick counts are kept in that type
    // rather than time_t. The ratio is formed in double to avoid losing
    // precision for large tick counts.
    // NOTE(review): clock() reports processor time used by the program, not
    // wall-clock time; CUDA events would time the kernel more directly.
    clock_t start = clock();
    CPUDot(A, B, C);
    clock_t elapsed = clock() - start;
    printf("CPU Dotting took %lf ms\n", ((double)elapsed / CLOCKS_PER_SEC) * 1000);
    //C.print();

    //A.populate(3);

    // GPUDot synchronizes with the device before returning, so the interval
    // covers the kernel's execution as well as its launch.
    start = clock();
    GPUDot(A, B, D);
    elapsed = clock() - start;
    printf("GPU Dotting took %lf ms\n", ((double)elapsed / CLOCKS_PER_SEC) * 1000);

    //D.print();

    subtract(C, D, diff);
    printf("Subtracted the matricies\n");
    // NOTE(review): this is a signed sum, so positive and negative
    // differences can cancel; 0 does not by itself prove C and D are equal.
    double error = sum(diff);

    printf("Error: %lf\n", error);

    return 0;
}

我知道这不是解决此问题的最佳方法——我应该利用 Dot<<<1, threads>>>(params) 中的第一个参数；但我只是想先看看它能否工作，而且我不明白它为什么不行。

编辑：我添加了正确的cuda错误检查，我更新了代码以显示所做的更改

现在我得到了 gpuAssert 的输出：

GPUassert: unspecified launch failure Matrix.cu 98

我还在另一篇帖子（another post）中读到，“unspecified launch failure”几乎总是意味着段错误；但我认为，如果是索引问题，它不会只在矩阵太大时才出现。我还把线程数改成了 1，所以尖括号内的配置始终是 <<<1, 1>>>，因此索引（index）和步长（stride）始终分别为 0 和 1。

Cuda程序无法缩放

0 个答案: