Question

我是CUDA的新手。我编写了一些简单的代码，它试图将随机初始化矩阵复制到设备内存，将每个矩阵条目的值递增1，然后将其传回主机内存。

编译或运行代码时没有错误。但是，似乎内核没有启动，因为启动内核后矩阵条目的值是相同的。

知道那里发生了什么吗？

#include <iostream>

using namespace std;

#define SIZE 2

void print_matrix (int size, float *array);
void matrix_initialize(int size, float *array);

__global__ void LU(float * m, int size){
m[threadIdx.y*size + threadIdx.x] ++ ;
}


int main(){
    srand(0);
    //variables
    float *a =  new float[SIZE*SIZE];
    dim3 blockdim(2,2,0);
    dim3 griddim(1,0,0);

    //initialize 
    matrix_initialize(SIZE, a);
    print_matrix (SIZE, a);


    //allocate space on device memory:
    float * Ad;
    int size = SIZE * SIZE;
    cudaMalloc ((void **)&Ad, size);

    //transfer data to device memory:
    cudaMemcpy(Ad , a, size, cudaMemcpyHostToDevice);

    //run the kernel
    LU<<<griddim,blockdim>>>(Ad, SIZE);


    // transfer the data back to the host memory
    cudaMemcpy(a , Ad, size, cudaMemcpyDeviceToHost);

    //test if the kernel runing the kernel has changed the value
    print_matrix (SIZE, a);


    // free device memory :
    cudaFree (Ad);



return 0;
}


void print_matrix (int size, float *array){
    for (int i=0; i < size*size ; i++){

        if(i % size == 0)
        cout << endl;
        cout << array [i] << "  ";

    }

}

void matrix_initialize(int size, float *array){

    for (int i = 0; i< SIZE*SIZE; i++){
            array[i] = rand()/(float) RAND_MAX;
    }
}

Answer 1

未使用的尺寸应设为1而不是0：

dim3 blockdim(2, 2, 1);
dim3 griddim(1, 1, 1);

您的代码启动2 x 2 x 0 = 0个块，每个1 x 0 x 0 = 0个线程。

您的尺码计算错误：

int size = SIZE * SIZE * sizeof(float);

您的代码不考虑数组元素大小。

CUDA内核发布

1 个答案: