Please consider the following code, which I took from a tutorial, together with its accompanying explanatory image. It is meant to demonstrate parallel reduction in CUDA.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <numeric>
using namespace std;
__global__ void sumSingleBlock(int* d)
{
    int tid = threadIdx.x;

    // Number of participating threads (tc) halves on each iteration
    for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
    {
        // Thread must be allowed to write
        if (tid < tc)
        {
            // We need to do A + B, where B is the element following A, so first we
            // need to find the position of element A and of element B
            int posA = tid * stepSize * 2;
            int posB = posA + stepSize;

            // Update the value at posA by adding the value at posB to it
            d[posA] += d[posB];
        }
    }
}
int main()
{
    cudaError_t status;
    const int count = 8;
    const int size = count * sizeof(int);

    int* h = new int[count];
    for (int i = 0; i < count; ++i)
        h[i] = i + 1;

    int* d;
    status = cudaMalloc(&d, size);
    status = cudaMemcpy(d, h, size, cudaMemcpyHostToDevice);

    sumSingleBlock<<<1, count / 2>>>(d);

    int result;
    status = cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);

    cout << "Sum is " << result << endl;
    getchar();

    cudaFree(d);
    delete[] h;
    return 0;
}
Now, I can follow the general principle of the reduction shown in the image. What I don't get is how the additions can be safe (*):

Clearly, all four threads run the loop the same number of times; it is only while tid < tc that they do anything useful. Thread #0 adds 1 and 2 and stores the result in element 0; its second iteration then accesses element 2. Meanwhile, thread #1's first iteration adds 3 and 4 and stores the result in element 2.

What if thread #0 starts iteration 2 before thread #1 has finished iteration 1? That would mean thread #0 could read 3 instead of 7, or possibly even a torn value(?). There is no synchronization here at all, so is the code wrong?
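To make the overlap concrete, here is a small host-only sketch I put together (it is not from the tutorial; it merely replays the kernel's index arithmetic for the four threads of the <<<1, count/2>>> launch) that prints which elements each thread reads and writes on every iteration:

#include <iostream>
using namespace std;

// Host-only trace of the kernel's index arithmetic, assuming blockDim.x == 4
// (the <<<1, count/2>>> launch above). No GPU code runs here.
int main()
{
    const int blockDimX = 4;
    int iteration = 1;
    for (int tc = blockDimX, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1, ++iteration)
    {
        for (int tid = 0; tid < blockDimX; ++tid)
        {
            if (tid < tc)
            {
                int posA = tid * stepSize * 2;
                int posB = posA + stepSize;
                cout << "iteration " << iteration << ": thread " << tid
                     << " reads d[" << posA << "], d[" << posB << "]"
                     << " and writes d[" << posA << "]" << endl;
            }
        }
    }
    return 0;
}

Its output shows that in iteration 2, thread 0 reads d[2], the very element that thread 1 writes in iteration 1, which is exactly the ordering I am worried about.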
(*) Note: it's not that I'm sure there is no race condition; I'm simply putting my faith in the tutorial's code being safe and correct.
Answer 0 (score: 1)
The code is wrong: it needs a __syncthreads() call, as shown below. The barrier goes at the end of the loop body, outside the if, so that every thread in the block reaches it.
__global__ void sumSingleBlock(int* d)
{
    int tid = threadIdx.x;

    // Number of participating threads (tc) halves on each iteration
    for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
    {
        // Thread must be allowed to write
        if (tid < tc)
        {
            // We need to do A + B, where B is the element following A, so first we
            // need to find the position of element A and of element B
            int posA = tid * stepSize * 2;
            int posB = posA + stepSize;

            // Update the value at posA by adding the value at posB to it
            d[posA] += d[posB];
        }

        // Barrier: all writes of this iteration must complete before any thread
        // reads those elements in the next iteration. It sits outside the if so
        // that every thread in the block reaches it.
        __syncthreads();
    }
}
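For completeness, here is a minimal sketch of how the corrected kernel could be launched and verified; it is not part of the original answer, and the error checks, the std::vector, and the cudaDeviceSynchronize() call are additions. The host-side reference sum uses std::accumulate, which the question's <numeric> include already makes available:

#include "cuda_runtime.h"
#include <iostream>
#include <numeric>
#include <vector>

// Assumes the corrected sumSingleBlock kernel above is defined in the same file.
int main()
{
    const int count = 8;
    const int size = count * sizeof(int);

    std::vector<int> h(count);
    for (int i = 0; i < count; ++i)
        h[i] = i + 1;

    // Reference result computed on the host: 1 + 2 + ... + 8 = 36
    const int expected = std::accumulate(h.begin(), h.end(), 0);

    int* d = nullptr;
    if (cudaMalloc(&d, size) != cudaSuccess)
    {
        std::cerr << "cudaMalloc failed" << std::endl;
        return 1;
    }
    cudaMemcpy(d, h.data(), size, cudaMemcpyHostToDevice);

    // One block, one thread per pair of elements, as in the question
    sumSingleBlock<<<1, count / 2>>>(d);
    if (cudaDeviceSynchronize() != cudaSuccess)   // catches launch/execution errors
    {
        std::cerr << "kernel failed" << std::endl;
        return 1;
    }

    int result = 0;
    cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d);

    std::cout << "GPU sum = " << result << ", CPU sum = " << expected << std::endl;
    return (result == expected) ? 0 : 1;
}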