Atomic operations on shared memory in CUDA

Date: 2011-10-10 07:19:24

Tags: cuda atomic

I am using a GTX 280, which has compute capability 1.3 and supports atomic operations on shared memory. I am using CUDA SDK 2.2 and VS 2005. In my program I have to use atomic operations extensively, because there is simply no other way.

One example: I have to compute the running sum of an array and find the index at which the sum crosses a given cutoff value. For this I use a variant of the scan algorithm and use atomicMin to record the index whenever the running sum crosses the threshold, so that at the end shared memory holds the smallest index at which the sum exceeds the cutoff.
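
To make that concrete, here is a minimal, self-contained sketch of the pattern (placeholder names throughout; counts, cutval and first_crossing are illustrative, not from my real kernel, which is posted further down):

    // Sketch: one 256-thread block scans 256 counts (Hillis-Steele) and
    // records, via atomicMin on shared memory, the smallest index whose
    // running sum exceeds cutval. Launched as first_crossing<<<1, 256>>>(...).
    __global__ void first_crossing(const int* counts, int cutval, int* out)
    {
        __shared__ int prefix_sum[256];
        __shared__ int min_idx;

        int k = threadIdx.x;
        prefix_sum[k] = counts[k];
        if (k == 0) min_idx = 256;               // "not found" sentinel
        __syncthreads();

        for (int offset = 1; offset < 256; offset <<= 1)
        {
            int addend = (k >= offset) ? prefix_sum[k - offset] : 0;
            __syncthreads();                     // all reads before any write
            prefix_sum[k] += addend;
            __syncthreads();
        }

        if (prefix_sum[k] > cutval)
            atomicMin(&min_idx, k);              // shared atomic: needs cc >= 1.2
        __syncthreads();

        if (k == 0) *out = min_idx;
    }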

This is just one component of the kernel; the kernel call contains many similar blocks of code.

I have three questions:

  1. First, I could not compile the code at all: the compiler said the atomic operations were not defined, and I searched but could not find which header I had to include.
  2. Second, I managed to compile it by copying code shipped with the CUDA SDK, but then it complained that atomic operations are not supported on shared memory (a minimal repro follows this list).
  3. Even after I fixed that by compiling with -arch sm_12 on the command line, the code fragments that use these atomic operations take a huge amount of time. I believe I should still get some speedup even in the worst case, because there are not many atomic operations and I am using 1 block of 16x16 threads. Unfortunately, the serial code runs about 10x faster.
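
For reference, here is a minimal sketch of the kind of kernel that trips these errors (hypothetical names, not my real code): without an appropriate -arch flag, nvcc rejects the shared-memory atomicAdd below.

    // Hypothetical minimal repro: a 256-bin histogram in shared memory,
    // launched as histo_shared<<<1, 256>>>(in, n, out). Compiling without
    // -arch sm_12 (or higher) makes nvcc reject the shared-memory atomicAdd.
    __global__ void histo_shared(const unsigned char* in, int n, unsigned int* out)
    {
        __shared__ unsigned int bins[256];
        int k = threadIdx.x;
        bins[k] = 0;
        __syncthreads();

        for (int idx = k; idx < n; idx += blockDim.x)
            atomicAdd(&bins[in[idx]], 1);   // needs compute capability >= 1.2
        __syncthreads();

        out[k] = bins[k];
    }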



    Below I am posting the kernel code; this kernel call seems to be the bottleneck, so it would be great if someone could help me optimize it. The serial code simply performs these operations serially. I am using a block configuration of 16 x 16.

    The code may look lengthy, but it actually contains an if block and a while block that perform almost the same task; unfortunately they could not be merged.

    #define limit (int)(log((float)256)/log((float)2))   // evaluates to 8 == log2(256)
    
    // Receives a pointer to an image, some scalars, and 4 more arrays: cont (size 256), vars (some constants/state), lim and buf (both image-sized)
    // block configuration 1 block of 16x16
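    // NOTE: cut, cuts, n, hmin, hminc, hmax, hmaxc, ox, oy and d1 used below
    // are assumed to be integer index constants into vars[] (their #defines
    // are not shown here)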
    
    __global__ void kernel_Main(unsigned char* in, int height,int width, int bs,int th, double cutoff, uint* cont,int* vars, unsigned int* lim,unsigned int* buf)
    {  
    
        int j = threadIdx.x;
        int i = threadIdx.y;
    
        int k = i*blockDim.x+j;
    
    
        __shared__  int prefix_sum[256];  
        __shared__  int sum_s[256];
        __shared__  int ary_shared[256];
        __shared__  int he_shared[256];
    
        // this is the threshold
        int cutval = (2*width*height)*cutoff;
        prefix_sum[k] = cont[k];
    
        int l;
        // a variant of scan algorithm 
        for(l=0;l<=limit;l++)
        {
            sum_s[k]=prefix_sum[k];
    
            if(k >= (int)pow((float)2,(float)l))
            {  
                prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                // Find out the minimum index for which the cumulative sum crosses the threshold
                if(prefix_sum[k] > cutval)
                {
                    atomicMin(&vars[cut],k);
                }
            }
            __syncthreads();
        }
    
        // The first thread will store the value in global array
        if(k==0)
        {
            vars[cuts]=prefix_sum[vars[cut]];
        }
        __syncthreads();
    
    
        // vars[n] presumably holds the number of window positions left to process
        if(vars[n])
        {
            // bs = 7 in this case
            if(i<bs && j<bs)
            {
                // using atomic add because the index could be same for 2 different threads
                atomicAdd(&ary_shared[in[i*(width) + j]],1);  
            }
            __syncthreads();
    
    
            int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
            prefix_sum[k] = ary_shared[k];
            sum_s[k] = 0;
    
            // Again prefix sum
    
            int l;
            for(l=0;l<=limit;l++)
            {
                sum_s[k]=prefix_sum[k];
    
                if(k >= (int)pow((float)2,(float)l))
                {  
                    prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                    // Find out the minimum index for which the cumulative sum crosses the threshold
                    if(prefix_sum[k] > minth)
                    {
                        atomicMin(&vars[hmin],k);
                    }
                }
                __syncthreads();
            }
    
             // set the maximum value here
            if(k==0)
            {
                vars[hminc]=prefix_sum[255];
                // because we will always overshoot by 1
                vars[hmin]--;
            }
    
            __syncthreads();
    
            int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
            prefix_sum[k] = ary_shared[255-k];
    
            for(l=0;l<=limit;l++)
            {
                sum_s[k]=prefix_sum[k];
    
                if(k >= (int)pow((float)2,(float)l))
                {  
                    prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                    // Find out the minimum index for which the cumulative sum crosses the threshold
                    if(prefix_sum[k] > maxth)
                    {
                        atomicMin(&vars[hmax], k);
                    }
                }
                 __syncthreads();
            }
             // set the maximum value here
    
             if(k==0)
             {
                vars[hmaxc]=prefix_sum[255];
                vars[hmax]--;
                vars[hmax]=255-vars[hmax];
    
             }
            __syncthreads();
    
    
    
            int rng = vars[hmax] - vars[hmin];
            if(rng >= vars[cut])
            {
              if( k <= vars[hmin] )
                    he_shared[k] = 0;
              else if( k >= vars[hmax])
                    he_shared[k] = 255;
              else
                    he_shared[k] = (255 * (k - vars[hmin])) / rng;
            }
             __syncthreads();
    
            // only 7x7 = 49 threads will do this
            if(i>0 && i<=bs && j>0 && j<=bs)
            {
               int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
    
               if(rng >= vars[cut])
               {
                  int value = he_shared[in[base]];
                  buf[base]+=value;
                  lim[base]++;
               }
               else
               {
                  buf[base]+=255;
                  lim[base]++;
               }
            }
    
            if(k==0)
                vars[n]--;
    
            __syncthreads();   
    
    
        }// if(n) block closes here
    
        while(vars[n])
        {
    
    
            if(k==0)
            {
                if( vars[ox]==0 && vars[d1] ==3 )
                    vars[d1] = 0; // l2r
                else if( vars[ox]==0 && vars[d1]==2 )
                    vars[d1] = 3; // l u2d
                else if( vars[ox]==width-bs && vars[d1]==0)
                    vars[d1] = 1; // r u2d
                else if( vars[ox]==width-bs && vars[d1]==1)
                    vars[d1] = 2; // r2l
    
            }
    
            // Because this value will be changed so
            // all the threads should set their registers before
            // they move forward
            int ox_d = vars[ox];
            int oy_d = vars[oy];
    
            // Just putting it here so that all the threads should have set their
            // values before moving on, as this value will be changed
            __syncthreads();
    
            // Incrementally update the shared histogram: remove the pixels
            // leaving the sliding window and add the pixels entering it
            if(vars[d1]==0)
            {
    
                if(i == 0 && j < bs)
                {
                    int index = j*width + ox_d + oy_d*width;
                    int index2 = j*width + ox_d + oy_d*width +bs;
    
                    atomicSub(&ary_shared[in[index]],1);
                    atomicAdd(&ary_shared[in[index2]],1);
                }
    
                // The first thread of the first block should set this value
                if(k==0)
                    vars[ox]++;
            }
            else if(vars[d1]==1||vars[d1]==3)
            {
    
                if(i == 0 && j < bs)
                {
                    /*if(j==0)
                    printf("Entered 1||3\n");*/
                    int index = j*width + ox_d + oy_d*width;
                    int index2 = j*width + ox_d + (oy_d+bs)*width;
    
                    atomicSub(&ary_shared[in[index]],1);
                    atomicAdd(&ary_shared[in[index2]],1);
    
                }
                // The first thread of the first block should set this value
                if(k==0)
                    vars[oy]++;
    
            }
            else if(vars[d1]==2)
            {
    
                if(i == 0 && j < bs)
                {
                    int index = j*width + ox_d-1 + oy_d*width;
                    int index2 = j*width + ox_d-1 + oy_d*width +bs;
    
                    atomicAdd(&ary_shared[in[index]],1);
                    atomicSub(&ary_shared[in[index2]],1);
    
                }
                // The first thread of the first block should set this value
                if(k==0 )
                    vars[ox]--;
             }
            __syncthreads();
    
            //ary_shared has been calculated
    
            // Reset the hmin and hminc values
            // again the same task as done in the if(n) loop
            if(k==0)
            {
                vars[hmin]=0;
                vars[hminc]=0;
                vars[hmax]=0;
                vars[hmaxc]=0;
            }
            __syncthreads();
    
            int minth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
            prefix_sum[k] = ary_shared[k];
    
            int l;
            for(l=0;l<=limit;l++)
            {
                sum_s[k]=prefix_sum[k];
    
                if(k >= (int)pow((float)2,(float)l))
                {  
                    prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                    // Find out the minimum index for which the cumulative sum crosses the threshold
                    if(prefix_sum[k] > minth)
                    {
                        atomicMin(&vars[hmin],k);
                    }
                }
                __syncthreads();
            }
    
             // set the maximum value here
            if(k==0)
            {
                vars[hminc]=prefix_sum[255];
                vars[hmin]--;
            }
            __syncthreads();
    
            // Calculate maxth
            int maxth = 1>((bs*bs)/20)? 1: ((bs*bs)/20);
            prefix_sum[k] = ary_shared[255-k];
    
            for(l=0;l<=limit;l++)
            {
                sum_s[k]=prefix_sum[k];
    
                if(k >= (int)pow((float)2,(float)l))
                {  
                    prefix_sum[k]+=sum_s[k-(int)pow((float)2,(float)l)];
                    // Find out the minimum index for which the cumulative sum crosses the threshold
                    if(prefix_sum[k] > maxth)
                    {
                        atomicMin(&vars[hmax], k);
                    }
                }
                 __syncthreads();
            }
             // set the maximum value here
    
             if(k==0)
             {
                vars[hmaxc]=prefix_sum[255];
                vars[hmax]--;
                vars[hmax]=255-vars[hmax];
             }
            __syncthreads();
    
            int rng = vars[hmax] - vars[hmin];
            if(rng >= vars[cut])
            {
              if( k <= vars[hmin] )
                    he_shared[k] = 0;
              else if( k >= vars[hmax])
                    he_shared[k] = 255;
              else
                    he_shared[k] = (255 * (k - vars[hmin])) / rng;
            }
             __syncthreads();
    
    
            if(i>0 && i<=bs && j>0 && j<=bs)
            {
               int base = (vars[oy]*width+vars[ox])+ (i-1)*width + (j-1);
    
               if(rng >= vars[cut])
               {
                  int value = he_shared[in[base]];
                  buf[base]+=value;
                  lim[base]++;
               }
               else
               {
    
                  buf[base]+=255;
                  lim[base]++;
    
               }
            }
    
            // This just might cause a little bit of problem
            if(k==0)
                vars[n]--;
    
            // All threads will wait here before continuing the while loop
            __syncthreads();
    
        }// end of while(n)
    }
    

1 Answer:

Answer 0: (score: 3)

Firstly, you need -arch sm_12 (or, in your case, it should really be -arch sm_13) on the nvcc command line to enable atomic operations.

As for performance, there is no guarantee that your kernel will be any faster than ordinary code on the CPU: there are many problems that simply do not fit the CUDA model well, and these may indeed run much slower than on the CPU. You need to do some analysis/design/modelling before coding any CUDA kernel, otherwise you risk wasting a lot of time on something that is never going to fly.

Having said that, it may well be possible to implement your algorithm more efficiently. Perhaps you could post the CPU code and invite ideas on how to implement it efficiently in CUDA?
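
For example, one cost is immediately visible in the posted kernel: every scan iteration calls pow() twice per thread, and (unless I am misreading it) sum_s[k] is written and sum_s[k - offset] is read with no barrier in between, which is a race across warps. Here is a sketch of a cheaper, race-free scan step; it reuses the names from your kernel (prefix_sum, sum_s, k, cutval, vars, cut) and assumes cont[] holds non-negative counts, so the prefix sum is monotone:

    // Sketch only, not a drop-in replacement: one Hillis-Steele scan step
    // per iteration, with a bit shift instead of two pow() calls and a
    // barrier separating the snapshot from the accumulation.
    for (int offset = 1; offset < 256; offset <<= 1)   // offset == 2^l
    {
        sum_s[k] = prefix_sum[k];
        __syncthreads();          // snapshot complete before anyone reads it
        if (k >= offset)
            prefix_sum[k] += sum_s[k - offset];
        __syncthreads();
    }

    // A monotone prefix sum means the smallest index crossing cutval can be
    // found without atomics: exactly one thread satisfies this condition.
    if (prefix_sum[k] > cutval && (k == 0 || prefix_sum[k - 1] <= cutval))
        vars[cut] = k;            // plain store: only one thread writes

The same transformation applies to each of the scan loops above. Whether it closes a 10x gap is another question: a single 16x16 block occupies one multiprocessor and leaves the other 29 on a GTX 280 idle, so the launch configuration itself caps how fast this kernel can ever be.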