Question

在主机端，我正在读取128 x 128整数数组，其随机值介于0-31之间。我有一个Occurrences数组，存储值0-31，然后在设备上尝试执行一个内核，该内核循环遍历128 x 128数组中的值，然后计算0-31出现的次数。

我在如何拆分CUDA中的块/线程以及如何使内核向主机提供通信并打印出每个元素的出现次数方面遇到问题。这是我第一次使用CUDA和我将不胜感激任何建设性的建议！到目前为止，这是我的代码：

 #include <stdio.h>
#include <stdlib.h>
#include <cuda.h>


#define MAXR 16
#define MAXC 16
#define N 256
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {

    int count = 0;
    //provide unique thread ID
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    int k;
    //for(k=0; k < 32;k++) {
    //  occurrences_d[k]=k;
//  }


    if(idx < N) {
        //for(k=0; k < MAXR*MAXC; k++) {
    for(int j=0; j<32; j++) {
            count =0;
        if(arrayONE_d[idx]==occurrences_d[j]){

            count+=1;
            occurrences_final_d[j] =count;
        }
        else {}


    }
    }
    //occurrences_final_d[0] = 77;
    }
}


int main(void) {



    //const int N = MAXR*MAXC;

    int arr1_h[MAXR][MAXC];
    //int *occurrences_h[0][32];
    //creating arrays for the device (GPU)
    //int *arr1_d;
    int occurrences_h[32];
    int *occurrences_d;

    int *occurrences_final_h[32] = {0};
    int *occurrences_final_d;

    int *arrayONE_h[256] = {0};
    int *arrayONE_d;

    int i, j;

    // allocating memory for the arrays on the device
    cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
    cudaMalloc( (void**) &occurrences_d,  32* sizeof(int));
    cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));

    /*
    for(i=0; i < 32; i++) {

        occurrences_h[i] = i;

    }
/*
 *
 */
    //Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
    FILE *fp;
    fp =fopen("arrays16.txt","r");

     // this loop takes the information from .txt file and puts it into arr1 matrix
    for(i=0;i<MAXR;i++) {


        for(j=0;j<MAXC;j++)
        {
            fscanf(fp,"%d\t", &arr1_h[i][j]);
        }

    }

    for(i=0;i<MAXR;i++) {
        printf("\n");

        for(j=0;j<MAXC;j++) {
            //printf("d\t", arr1_h[i][j]);
        }

        printf("\n\n");
    }


    int x,y;
    int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
    for(x=0;x<MAXR;x++)
    {
        for(y=0;y<MAXC;y++)
        {
            //  printf("**%d   ",arr1_h[x][y]);

            arrayONE_h[z]= &arr1_h[x][y];
            z++;

        }
    }


    for(x=0; x < 256; x++) {
        printf("%d\n", *arrayONE_h[x]);
        //return 0;

    }

    int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);

    printf("\n\n");
    printf("**LENGTH = %d", length);

    // copying the arrays/memory from the host to the device (GPU)
    cudaMemcpy(arrayONE_d, &arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(occurrences_d, &occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(occurrences_final_d, &occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice);

    // how many blocks we will allocate
    //dim3 DimGrid();
    //how many threads per block we will allocate
    dim3 DimBlock(256);

    //kernel launch against the GPU
    count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);

    //copy the arrays post-computation from the device back to the host (CPU)
    cudaMemcpy(&occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);

    // some error checking - run this with cuda-memcheck when executing your code
    cudaError_t errSync  = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    //free up the memory of the device arrays
    cudaFree(arrayONE_d);
    cudaFree(occurrences_d);
    cudaFree(occurrences_final_d);

    //print out the number of occurrences of each 0-31 value
    for(i=0;i<32;i++) {
        printf("\n");

        printf("%d\n",occurrences_final_h[i]);

    }

}

Answer 1

正如我在评论中提到的那样，您对指针的理解存在缺陷。为了解决这个问题，我在代码中的许多地方进行了更改。我已经用注释// mod标记了其中的大多数，但我可能错过了一些。

此外，当多个线程可以更新同一位置时，内核根本无法跟踪元素。解决这个问题的一种方法是使用原子（我已经证明了。）还有其他各种方法，例如并行归约，但是这些都不是对内核的琐碎改动。另外，您的内核逻辑在几种方面已被破坏。

接下来的工作是，我可以对您的代码进行最少的修改，以使它们变得明智。您可以使用一些编译开关来探索不同的内核行为：

无开关-靠近您的内核，但无法正常工作
-DUSE_ATOMICS将演示对内核的修改，以使其能够正确计数。
-DUSE_ALT_KERNEL探索了另一种内核逻辑方法：为每个直方图bin分配一个线程，并使每个线程遍历整个数组，并跟踪属于该bin的元素。由于只有一个线程正在向每个bin结果写入，因此不需要原子。但是，我们只能拥有与bin一样多的线程（使用这种简单的实现）。在没有太多困难的情况下，此方法可以扩展为每个bin using warp shuffle to do a final warp-level reduction一次扭曲，然后让一个线程将最终结果写入bin。这将在某种程度上提高内存访问效率。但是，这也将复杂性引入了您可能尚未了解的内核。

代码如下：

$ cat t316.cu
 #include <stdio.h>
#include <stdlib.h>
#include <cuda.h>


#define MAXR 16
#define MAXC 16
#define BINS 32
#define N (MAXR*MAXC)
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {

    //provide unique thread ID
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
#ifndef USE_ALT_KERNEL
    if(idx < N) {
        //for(k=0; k < MAXR*MAXC; k++) {
    for(int j=0; j<32; j++) {
        if(arrayONE_d[idx]==occurrences_d[j]){
#ifndef USE_ATOMICS
            occurrences_final_d[j]++;
#else
         atomicAdd(occurrences_final_d+j, 1);
#endif

        }
        else {}


    }
    }
#else
   // use one thread per histo bin
   if (idx < BINS){
     int count = 0;
     int myval = occurrences_d[idx];
     for (int i = 0; i < N; i++) if (arrayONE_d[i] == myval) count++;
     occurrences_final_d[idx] = count;
     }

#endif
    }


int main(void) {



    //const int N = MAXR*MAXC;

    int arr1_h[MAXR][MAXC];
    //int *occurrences_h[0][32];
    //creating arrays for the device (GPU)
    //int *arr1_d;
    int occurrences_h[32]; // mod
    int *occurrences_d;

    int occurrences_final_h[32] = {0};  // mod
    int *occurrences_final_d;

    int arrayONE_h[256] = {0};  // mod
    int *arrayONE_d;

    int i, j;

    // allocating memory for the arrays on the device
    cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
    cudaMalloc( (void**) &occurrences_d,  32* sizeof(int));
    cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));

    /*
    for(i=0; i < 32; i++) {

        occurrences_h[i] = i;

    }
 */
    //Reading in matrix from .txt file and storing it in arr1 on the host (CPU)

//    FILE *fp;
//    fp =fopen("arrays16.txt","r");

     // this loop takes the information from .txt file and puts it into arr1 matrix
    for(i=0;i<MAXR;i++) {


        for(j=0;j<MAXC;j++)
        {
//            fscanf(fp,"%d\t", &arr1_h[i][j]);
              arr1_h[i][j] = j;  // mod
        }

    }

    for(i=0;i<MAXR;i++) {

        for(j=0;j<MAXC;j++) {
            //printf("d\t", arr1_h[i][j]);
        }

    }


    int x,y;
    int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
    for(x=0;x<MAXR;x++)
    {
        for(y=0;y<MAXC;y++)
        {
            //  printf("**%d   ",arr1_h[x][y]);

            arrayONE_h[z]= arr1_h[x][y];  // mod
            z++;

        }
    }


    for(x=0; x < 256; x++) {
//        printf("%d\n", arrayONE_h[x]);  // mod
        //return 0;

    }

    int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);

    printf("**LENGTH = %d\n", length);

    // copying the arrays/memory from the host to the device (GPU)
    cudaMemcpy(arrayONE_d, arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);  //mod
    cudaMemcpy(occurrences_d, occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);   // mod
    cudaMemcpy(occurrences_final_d, occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice); // mod

    // how many blocks we will allocate
    //dim3 DimGrid();
    //how many threads per block we will allocate
#ifndef USE_ALT_KERNEL
    dim3 DimBlock(N);
#else
    dim3 DimBlock(BINS);
#endif
    //kernel launch against the GPU
    count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);

    //copy the arrays post-computation from the device back to the host (CPU)
    cudaMemcpy(occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost); // mod
    cudaMemcpy(occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);  // mod

    // some error checking - run this with cuda-memcheck when executing your code
    cudaError_t errSync  = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));

    //free up the memory of the device arrays
    cudaFree(arrayONE_d);
    cudaFree(occurrences_d);
    cudaFree(occurrences_final_d);

    //print out the number of occurrences of each 0-31 value
    for(i=0;i<32;i++) {
        printf("%d ",occurrences_final_h[i]);

    }
    printf("\n");
}
$ nvcc -o t316 t316.cu
$ cuda-memcheck ./t316
========= CUDA-MEMCHECK
**LENGTH = 256
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
========= ERROR SUMMARY: 0 errors
$ nvcc -o t316 t316.cu -DUSE_ATOMICS
$ ./t316
**LENGTH = 256
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
$ nvcc -o t316 t316.cu -DUSE_ALT_KERNEL
$ cuda-memcheck ./t316
========= CUDA-MEMCHECK
**LENGTH = 256
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
========= ERROR SUMMARY: 0 errors
$

在上面的输出中，我们看到基本内核产生了错误的结果。原子核和备用核产生正确的结果

（您的代码已修改为使用合成数据，因此不需要打开文件。）

CUDA +使用C

1 个答案: