Generic dot product in CUDA

Posted: 2014-01-03 18:05:03

Tags: cuda dot-product

I am new to C and CUDA, and I am writing a dot product function, but it is not giving me the correct results. Would some kind soul take a look?

I also have two questions:

  1. Why doesn't dot() work correctly?
  2. At line 57 (the product[threadIdx.x] = a[index] * b[index]; line in dot()), why is it product[threadIdx.x] rather than product[index]? Couldn't I write

    product[index] = a[index] * b[index]; ... if(index == 0){...} and sum every element up with the zeroth thread that way?

  3. Thank you very much.

    DEVICEQUERY:

      Device 0: "GeForce GTX 570"
      CUDA Driver Version / Runtime Version          6.0 / 5.5
      CUDA Capability Major/Minor version number:    2.0
    

    Makefile: nvcc -arch=sm_20 cuda_test.cu -o cuda_test

    in cuda_test.cu:

    #include <stdio.h>  // printf, scanf, NULL etc.
    #include <stdlib.h> // malloc, free, rand etc.
    #include <time.h>   // time, needed to seed rand() in random_ints
    
    #define N (3) //Number of threads we are using (also, length of array declared in main)
    
    #define THREADS_PER_BLOCK (1) //Threads per block we are using
    
    #define N_BLOCKS (N/THREADS_PER_BLOCK)
    
    /* Function to generate a random integer between 1-10 */
    void random_ints (int *a, int n)
    {
        int i;
        srand(time(NULL)); //Seed rand() with current time
        for(i=0; i<n; i++)
        { 
            a[i] = rand()%10 + 1; 
        }
        return;
    }
    
    /* Kernel that adds two integers a & b, stores result in c */
    __global__ void add(int *a, int *b, int *c) {
    //global indicates function that runs on 
    //device (GPU) and is called from host (CPU) code
    
        int index = threadIdx.x + blockIdx.x * blockDim.x;
    
        //threadIdx.x : thread index
        //blockIdx.x  : block index
        //blockDim.x  : threads per block
        //hence index is a thread counter across all blocks
        c[index] = a[index] + b[index];
    
    //note that pointers are used for variables
    //add() runs on device, so they must point to device memory
    //need to allocate memory on GPU
    }
    
    /* Kernel for dot product */
    __global__ void dot(int *a, int *b, int *c)
    {
        __shared__ int product[THREADS_PER_BLOCK]; //All threads in a block must be able 
                                                   //to access this array
    
        int index = threadIdx.x + blockIdx.x * blockDim.x; //index
    
        product[threadIdx.x] = a[index] * b[index]; //result of elementwise
                                                    //multiplication goes into product
    
        //Make sure every thread has finished
        __syncthreads();
    
        //Sum the elements serially to obtain dot product
        if( 0 == threadIdx.x ) //Pick one thread to sum, otherwise all will execute
        {
            int sum = 0;
            for(int j=0; j < THREADS_PER_BLOCK; j++) sum += product[j];
            //Done!
            atomicAdd(c,sum);
        }
    }
    
    int main(void)
    {
    
        int *a, *b, *c, *dotProduct; //host copies of a,b,c etc
        int *d_a, *d_b, *d_c, *d_dotProduct; //device copies of a,b,c etc
    
        int size = N * sizeof(int); //size of memory that needs to be allocated
    
        int i=0; //iterator
    
        //Allocate space for device copies of a,b,c
        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, size);
    
        //Setup input values
        a = (int *)malloc(size); random_ints(a,N);
        b = (int *)malloc(size); random_ints(b,N);
        c = (int *)malloc(size);
    
        //Copy inputs to device
        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    
        //Launch add() kernel on GPU
        add<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
        // triple angle brackets mark call from host to device
        // this is also known as a kernel launch
        // N/THREADS_PER_BLOCK = NO. OF BLOCKS
    
        //Copy result back to host
        cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    
        //Output results
        printf("a = {");
        for (i=0; i<N; i++) printf(" %d",a[i]);
        printf(" }\n");
    
        printf("b = {");
        for (i=0; i<N; i++) printf(" %d",b[i]);
        printf(" }\n");
    
        printf("c = {");
        for (i=0; i<N; i++) printf(" %d",c[i]);
        printf(" }\n");
    
        //Calculate dot product of a & b
        dotProduct = (int *)malloc(sizeof(int)); //Allocate host memory to dotProduct
        *dotProduct = 0; //initialise to zero
        cudaMalloc((void **)&d_dotProduct, sizeof(int)); //Allocate device memory to d_dotProduct
        dot<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_dotProduct); //Perform calculation
        cudaMemcpy(dotProduct, d_dotProduct, sizeof(int), cudaMemcpyDeviceToHost); //Copy result into dotProduct
        printf("\ndot(a,b) = %d\n", *dotProduct); //Output result
    
        //Cleanup
        free(a); free(b); free(c); free(dotProduct);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); cudaFree(d_dotProduct);
    
        return 0;
    } //End of main
    

2 Answers:

Answer 0 (score: 2)

As talonmies said, please turn this into something other people can actually run; embedding line numbers in the code is unhelpful.

The best guess, with no other information, is that you have not initialised d_dotProduct to zero. You can do this with cudaMemset(). If you wanted a different initial value you could cudaMemcpy() it from the host or launch a separate kernel to initialise it, but in this case cudaMemset() (the device-side equivalent of memset() on the host) is sufficient.
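
For illustration only, a minimal sketch of that cudaMemset() approach (reusing the variable names from the question) might look like this:

    //Zero the device-side accumulator before launching dot(),
    //so that the atomicAdd() in the kernel accumulates onto 0
    cudaMalloc((void **)&d_dotProduct, sizeof(int));
    cudaMemset(d_dotProduct, 0, sizeof(int)); //device-side counterpart of memset()
    dot<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_dotProduct);
    cudaMemcpy(dotProduct, d_dotProduct, sizeof(int), cudaMemcpyDeviceToHost);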

It is also possible that N_BLOCKS*THREADS_PER_BLOCK does not equal the array length (N).

Regarding your second question: product is a per-block array of size THREADS_PER_BLOCK, so if you access it with product[index] you will go out of bounds.
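
As a concrete illustration (using the question's own constants, THREADS_PER_BLOCK = 1 and N_BLOCKS = 3, purely as an example):

    __shared__ int product[THREADS_PER_BLOCK]; //one element per block, so the only valid index is 0

    int index = threadIdx.x + blockIdx.x * blockDim.x;

    //In block 2: threadIdx.x == 0 (in bounds), but index == 0 + 2*1 == 2
    product[threadIdx.x] = a[index] * b[index]; //OK: per-block slot, global element
    //product[index]     = a[index] * b[index]; //would write past the end of the 1-element shared array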

Answer 1 (score: -2)

Problem solved! "*c = 0" needs to be set before summing up the individual elements of the 'product' array.

/* Kernel for dot product */
__global__ void dot(int *a, int *b, int *c)
{
    __shared__ int product[THREADS_PER_BLOCK]; //All threads in a block must be able 
                                               //to access this array

    int index = threadIdx.x + blockIdx.x * blockDim.x; //index

    product[threadIdx.x] = a[index] * b[index]; //result of elementwise
                                                //multiplication goes into product

    if(index==0) *c = 0; //Ask one thread to set c to zero.

    //Make sure every thread has finished
    __syncthreads();    

    //Sum the elements serially to obtain dot product
    if( 0 == threadIdx.x ) //Every block to do c += sum
    {
        int sum = 0;
        for(int j=0; j < THREADS_PER_BLOCK; j++) sum += product[j];
        //Done!
        atomicAdd(c,sum);
    }
}