Question

我正在查看这个 sum_reduction.cu 示例及其教程，注意到它对某些问题规模不起作用：例如 n = 2000 时可以正常工作，但 n = 3000 时不行。显然，当问题规模是块大小的整数倍时它总能正常工作，但教程和示例代码都没有说明这一点。问题是：这种归约（reduction）算法是否只适用于某些问题规模？他们选用的例子是 N = 256k，它是偶数、是 2 的幂，也是块大小 512 的倍数。

为了让问题自成一体，我在这里贴出代码中最重要的部分（模板版本）：

// Per-block sum reduction: each block adds up the elements of `input` that its
// threads cover and stores that partial sum in per_block_results[blockIdx.x].
//
//   input              values to add up (n elements)
//   per_block_results  receives one partial sum per block
//   n                  number of valid elements in input
//
// Launch requirements visible from the code below:
//   - dynamic shared memory must hold blockDim.x elements of T, because sdata
//     is indexed by threadIdx.x;
//   - blockDim.x has to be a power of two: the loop starts at blockDim.x / 2
//     and keeps halving, so for any other block size some shared-memory slots
//     are never added in (e.g. blockDim.x == 6 never reads slot 2).
template<typename T>
__global__ void kernelSum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
    // dynamically sized shared buffer; its size comes from the launch configuration
    extern __shared__ T sdata[];

    // global index of the element handled by this thread
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory; threads past the end of the input
    // contribute 0 so they do not disturb the sum
    T x = 0.0;
    if (tid < n) {
        x = input[tid];
    }
    sdata[threadIdx.x] = x;
    // every slot must be written before any thread reads a neighbour's slot
    __syncthreads();

    // contiguous range pattern: at each step the lower `offset` threads add in
    // the value stored `offset` slots above them, then the range is halved
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
        if(threadIdx.x < offset) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }
        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

并调用内核：

// First pass: num_blocks blocks of block_size threads, with block_size doubles
// of dynamic shared memory per block; computes, per block, a partial sum of
// d_input into the first num_blocks slots of d_partial_sums_and_total.
// NOTE(review): the kernel shown above is named kernelSum, not block_sum.
block_sum<double> <<<num_blocks,block_size,block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);

// Second pass: a single block adds up the partial sums and stores the total in
// slot num_blocks, directly after them in the same buffer. Here num_blocks is
// used as the number of threads in that block.
block_sum<double> <<<1,num_blocks,num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);

据我理解，当问题规模不足以填满最后一个块时，语句 T x = 0.0; 会保证多出来的元素被置零，因此应该可以正常工作，但事实并非如此？

更新：抱歉，float/double 不一致是我整理问题时的笔误，并不是真正的问题所在。

Answer 1

您发布的代码前后不一致：模板化内核名为 kernelSum，但您调用的却是名为 block_sum 的内核。

此外，我认为您对模板化内核的调用不可能按所写的那样正常工作：

block_sum<double> <<<num_blocks,block_size,block_size * sizeof(float)>>>(d_input, d_partial_sums_and_total, num_elements);
           ^                                                     ^
           |  these types are required to match                  |

内核模板是用 double 类型实例化的。因此，根据下面这一行，它需要足以存放 block_size 个 double 的共享内存：

extern __shared__ T sdata[];

但是你只传递了所需存储空间的一半：

block_size * sizeof(float)

我相信这会给你意想不到的结果。

由于下面这个循环，所写的归约要求块维度（每个块的线程数）是 2 的幂：
```
// contiguous range pattern
for(int offset = blockDim.x / 2; offset > 0; offset >>= 1) {
```
这在第一次内核调用时可能不是问题，因为您可能会为每个块的线程数block_size选择2的幂：
```
block_sum<double> <<<num_blocks,block_size,...
```
但是，对于第二次内核调用，这取决于num_blocks是2的幂，这取决于您未显示的网格计算：
```
block_sum<double> <<<1,num_blocks,...
```
最后，如果num_blocks超出设备限制，则第一次内核启动将失败。对于非常大的数据集可能会发生这种情况，但可能不适用于3000，这取决于您未显示的网格计算。

上面的第 3 项（归约要求块维度为 2 的幂，因此第二次内核调用要求 num_blocks 也是 2 的幂）对于任意大小的向量来说，是一个难以动态满足的要求。因此，我建议采用另一种归约策略来处理任意大小的向量。为此，我建议您研究 CUDA 的 reduction 示例代码及其演示文稿（CUDA reduction sample code and presentation）。

这是一个完整的程序，主要基于您展示的代码，解决了上述问题，并且对我来说似乎适用于3000的大小：

#include <stdio.h>
#include <stdlib.h>
#define DSIZE 3000
#define nTPB 256



/**
 * Per-block sum reduction.
 *
 * Every block loads up to blockDim.x consecutive elements of `input`, adds
 * them up in shared memory and stores the block's total in
 * per_block_results[blockIdx.x].
 *
 * Launch requirements:
 *   - 1-D grid and 1-D block; any blockDim.x >= 1 works (powers of two are no
 *     longer required, and for them the pairing is the same as before).
 *   - Dynamic shared memory: blockDim.x * sizeof(T) bytes.
 *   - gridDim.x * blockDim.x >= n, otherwise trailing elements are never read.
 *   - per_block_results must hold gridDim.x elements and must not overlap the
 *     elements of `input` being read (both pointers are __restrict__).
 *
 * @param input              device array of n values to add up
 * @param per_block_results  device array receiving one partial sum per block
 * @param n                  number of valid elements in input
 */
template<typename T>
__global__ void block_sum(const T* __restrict__ input, T* __restrict__ per_block_results, const size_t n) {
    // Dynamic shared memory is declared as raw bytes and viewed as T. Declaring
    // `extern __shared__ T sdata[]` directly inside a template breaks as soon
    // as the kernel is instantiated for a second element type, because the same
    // extern array would then be declared with two different types.
    // 16-byte alignment covers the built-in scalar and vector types.
    extern __shared__ __align__(16) unsigned char block_sum_smem[];
    T* sdata = reinterpret_cast<T*>(block_sum_smem);

    // Widen before multiplying: blockIdx.x * blockDim.x is a 32-bit product
    // and would wrap for very large inputs before being stored in a size_t.
    const size_t tid = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    // load input into __shared__ memory; threads past the end contribute 0
    T x = 0.0;
    if (tid < n) {
        x = input[tid];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // Tree reduction over the first `active` slots. The half-width is rounded
    // up, so when `active` is odd the unpaired middle slot simply stays where
    // it is and is folded in by a later step instead of being dropped.
    // Writes go to slots [0, active - half) and reads come from [half, active);
    // these ranges never overlap, so one barrier per step is sufficient.
    for (unsigned int active = blockDim.x; active > 1; ) {
        const unsigned int half = (active + 1) / 2;
        if (threadIdx.x + half < active) {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + half];
        }
        // The loop bounds depend only on blockDim.x, so every thread of the
        // block reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // thread 0 writes the final result
    if (threadIdx.x == 0) {
        per_block_results[blockIdx.x] = sdata[0];
    }
}


// Sums DSIZE random doubles on the host and on the device (two block_sum
// passes) and prints both results for comparison.
// CUDA error checking is still left out here for brevity, as the accompanying
// text explains; add it before reusing this code.
int main(){

  double *d_input, *d_partial_sums_and_total, *h_input;

  const int num_elements = DSIZE;
  const int block_size = nTPB;

  // The second launch below runs num_blocks threads inside ONE block, so
  // num_blocks is bounded by the per-block thread limit (1024 on compute
  // capability 2.0 and later), not by the 65535 grid-dimension limit.
  const int max_threads_per_block = 1024;

  // Round the block count up to the next power of 2 so that the single-block
  // second pass always runs with a power-of-two blockDim.x. The surplus blocks
  // of the first pass read no input and therefore contribute 0.
  const int needed_blocks = (num_elements + block_size - 1) / block_size;
  int num_blocks = 1;
  while (num_blocks < needed_blocks) {
    num_blocks *= 2;
    if (num_blocks > max_threads_per_block) {printf("blocks failure\n"); exit(1);}
  }

  h_input = (double *)malloc(num_elements * sizeof(double));

  // Device buffer layout: num_blocks partial sums followed by one slot for the total.
  cudaMalloc((void **)&d_input, num_elements * sizeof(double));
  cudaMalloc((void **)&d_partial_sums_and_total, (num_blocks + 1) * sizeof(double));

  // Fill the input and compute the reference sum on the host.
  double h_result = 0.0;
  for (int i = 0; i < num_elements; i++) {
    h_input[i] = rand() / (double)RAND_MAX;
    h_result += h_input[i];
  }

  cudaMemcpy(d_input, h_input, num_elements * sizeof(double), cudaMemcpyHostToDevice);
  cudaMemset(d_partial_sums_and_total, 0, (num_blocks + 1) * sizeof(double));

  // launch one kernel to compute, per-block, a partial sum
  block_sum<double> <<<num_blocks, block_size, block_size * sizeof(double)>>>(d_input, d_partial_sums_and_total, num_elements);

  // launch a single block to compute the sum of the partial sums
  block_sum<double> <<<1, num_blocks, num_blocks * sizeof(double)>>>(d_partial_sums_and_total, d_partial_sums_and_total + num_blocks, num_blocks);

  // Only the total (the slot after the partial sums) is needed on the host.
  // This blocking copy also waits for both kernels to finish.
  double device_total = 0.0;
  cudaMemcpy(&device_total, d_partial_sums_and_total + num_blocks, sizeof(double), cudaMemcpyDeviceToHost);

  printf("host result   = %lf\n", h_result);
  printf("device result = %lf\n", device_total);

  // Release device and host buffers.
  cudaFree(d_partial_sums_and_total);
  cudaFree(d_input);
  free(h_input);
  return 0;
}

为了简洁和可读性，我在上面的代码中省略了错误检查。如果您的 CUDA 代码出现问题，应该始终进行正确的 CUDA 错误检查（proper cuda error checking）。

此外，将来，如果您发布完整的代码来演示您正在做的事情，您将更容易让其他人帮助您，就像我上面所做的那样。

在CUDA中减少块的错误尺寸？

1 个答案: