gpugems3中的前缀扫描CUDA示例代码是否正确?

时间:2015-06-14 16:52:18

标签: cuda gpu nvidia prefix-sum

我已经编写了一段代码来调用gpugem3

中的内核

但我得到的结果是一堆负数而不是前缀扫描。我想知道我的内核调用是错还是gpugem3代码有问题?

这是我的代码:

#include <stdio.h>
#include <sys/time.h>
#include <cuda.h>



__global__ void kernel(int *g_odata, int  *g_idata, int n, int dim)
{
     extern __shared__ int temp[];// allocated on invocation
    int thid = threadIdx.x;
    int offset = 1;

    temp[2*thid] = g_idata[2*thid]; // load input into shared memory
    temp[2*thid+1] = g_idata[2*thid+1];
    for (int d = n>>1; d > 0; d >>= 1) // build sum in place up the tree
    {
    __syncthreads();
    if (thid < d)
    {
    int ai = offset*(2*thid+1)-1;
    int bi = offset*(2*thid+2)-1;
    temp[bi] += g_idata[ai];
    }
    offset *= 2;
    }
    if (thid == 0) { temp[n - 1] = 0; } // clear the last element
    for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
    {
    offset >>= 1;
    __syncthreads();
    if (thid < d)
    {
    int ai = offset*(2*thid+1)-1;
    int bi = offset*(2*thid+2)-1;
    int t = temp[ai];
    temp[ai] = temp[bi];
    temp[bi] += t;
    }
    }
    __syncthreads();
    g_odata[2*thid] = temp[2*thid]; // write results to device memory
    g_odata[2*thid+1] = temp[2*thid+1];

}


void Initialize(int  *h_in,int num_items)
{

   int j;
   for(j=0;j<num_items;j++)

       h_in[j]=j;
       printf(" input: ");
         printf("\n\n");



}


int main(int argc, char** argv)
{
    int num_items = 512;


    int*  h_in = new int[num_items];


    // Initialize problem 
    Initialize(h_in, num_items);


    int *d_in = NULL;
    cudaMalloc((void**)&d_in, sizeof(int) * num_items);


if(cudaSuccess!=    cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)) fprintf(stderr,"could not copy to gpu");

    // Allocate device output array
    int *d_out = NULL;
    cudaMalloc((void**)&d_out, sizeof(int) * (num_items+1));


    kernel<<<1,256,num_items*sizeof(int)>>>(d_out, d_in,num_items, 2);

     int* h_out= new int[num_items+1];
    if( cudaSuccess !=cudaMemcpy(h_out,d_out,sizeof(int)*(num_items+1),cudaMemcpyDeviceToHost))fprintf(stderr,"could not copy back");
    int i;
    printf(" \n");
    for(i=0;i<num_items;i++)
    printf(" ,%d ",h_out[i]);
    // Cleanup
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (d_in) cudaFree(d_in);
    if (d_out) cudaFree(d_out);

    printf("\n\n");

    return 0;
}

1 个答案:

答案 0 :(得分:4)

您似乎在将GPU Gems 3 chapter中的代码转录到内核中时至少出现了1次错误。这一行不正确:

temp[bi] += g_idata[ai];

它应该是:

temp[bi] += temp[ai];

当我对您现在发布的代码进行一次更改时,似乎为我打印出正确的(独占扫描)前缀总和。我还会提到其他一些事情:

  1. 即使没有这种改变,我也会得到一些接近正确的结果。因此,如果您获得了大量不同的内容(例如负数),则可能是您的计算机设置或CUDA安装存在问题。我建议使用比你现在更严格的cuda error checking(尽管在你的一张支票中应该显示机器设置问题。)

  2. 精心制作的例程会有一些限制。它只能在单个线程块中使用,它在共享内存访问时会产生冲突,并且数据集大小会受限于单个线程块可以处理的内容(此例程每个线程产生两个输出元素,因此数据集大小预计等于线程数的两倍)。如前所述,动态共享内存分配需要与数据集大小一样大(即,线程大小的两倍,元素数量)。

  3. 这可能对学习很有用,但如果你想要一个健壮,快速的前缀扫描,建议你使用thrustcub的例程而不是你自己的代码,甚至如果来自这篇(旧)文章。

  4. 以下代码与您的代码类似,但它修复了上述问题,并且我已经将内核模板化以用于各种数据类型:

    #include <stdio.h>
    #define DSIZE 512
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    
    typedef int mytype;
    
    template <typename T>
    __global__ void prescan(T *g_odata, T *g_idata, int n)
    {
      extern __shared__ T temp[];  // allocated on invocation
      int thid = threadIdx.x;
      int offset = 1;
      temp[2*thid] = g_idata[2*thid]; // load input into shared memory
      temp[2*thid+1] = g_idata[2*thid+1];
      for (int d = n>>1; d > 0; d >>= 1)                    // build sum in place up the tree
      {
        __syncthreads();
        if (thid < d)
        {
          int ai = offset*(2*thid+1)-1;
          int bi = offset*(2*thid+2)-1;
          temp[bi] += temp[ai];
        }
        offset *= 2;
      }
      if (thid == 0) { temp[n - 1] = 0; } // clear the last element
      for (int d = 1; d < n; d *= 2) // traverse down tree & build scan
        {
          offset >>= 1;
          __syncthreads();
          if (thid < d)
          {
             int ai = offset*(2*thid+1)-1;
             int bi = offset*(2*thid+2)-1;
             T t = temp[ai];
             temp[ai] = temp[bi];
             temp[bi] += t;
          }
        }
      __syncthreads();
      g_odata[2*thid] = temp[2*thid]; // write results to device memory
      g_odata[2*thid+1] = temp[2*thid+1];
    }
    
    int main(){
    
      mytype *h_i, *d_i, *h_o, *d_o;
      int dszp = (DSIZE)*sizeof(mytype);
    
      h_i = (mytype *)malloc(dszp);
      h_o = (mytype *)malloc(dszp);
      if ((h_i == NULL) || (h_o == NULL)) {printf("malloc fail\n"); return 1;}
      cudaMalloc(&d_i, dszp);
      cudaMalloc(&d_o, dszp);
      cudaCheckErrors("cudaMalloc fail");
      for (int i = 0 ; i < DSIZE; i++){
        h_i[i] = i;
        h_o[i] = 0;}
      cudaMemset(d_o, 0, dszp);
      cudaCheckErrors("cudaMemset fail");
      cudaMemcpy(d_i, h_i, dszp, cudaMemcpyHostToDevice);
      cudaCheckErrors("cudaMemcpy 1 fail");
      prescan<<<1,DSIZE/2, dszp>>>(d_o, d_i, DSIZE);
      cudaDeviceSynchronize();
      cudaCheckErrors("kernel fail");
      cudaMemcpy(h_o, d_o, dszp, cudaMemcpyDeviceToHost);
      cudaCheckErrors("cudaMemcpy 2 fail");
      mytype psum = 0;
      for (int i =1; i < DSIZE; i++){
        psum += h_i[i-1];
        if (psum != h_o[i]) {printf("mismatch at %d, was: %d, should be: %d\n", i, h_o[i], psum); return 1;}
        }
      return 0;
    }