Please consider the following code, which I took from a tutorial, together with its accompanying explanatory image. It is meant to demonstrate parallel reduction in CUDA.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <numeric>
using namespace std;
__global__ void sumSingleBlock(int* d)
{
    int tid = threadIdx.x;

    // Number of participating threads (tc) halves on each iteration
    for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
    {
        // Thread must be allowed to write
        if (tid < tc)
        {
            // We need to do A + B, where B is the element following A, so first we
            // need to find the position of element A and of element B
            int posA = tid * stepSize * 2;
            int posB = posA + stepSize;

            // Update the value at posA by adding the value at posB to it
            d[posA] += d[posB];
        }
    }
}
int main()
{
    cudaError_t status;
    const int count = 8;
    const int size = count * sizeof(int);

    int* h = new int[count];
    for (int i = 0; i < count; ++i)
        h[i] = i + 1;

    int* d;
    status = cudaMalloc(&d, size);
    status = cudaMemcpy(d, h, size, cudaMemcpyHostToDevice);

    sumSingleBlock<<<1, count / 2>>>(d);

    int result;
    status = cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);

    cout << "Sum is " << result << endl;
    getchar();

    cudaFree(d);
    delete[] h;
    return 0;
}
Now, I can follow the general principle of the reduction shown in the image. What I don't get is how the additions can be safe (*):

Clearly, all four threads run the loop the same number of times; it is only while tid < tc that they do anything useful. Thread #0 adds 1 and 2 and stores the result in element 0; its second iteration then accesses element 2. Meanwhile, thread #1's first iteration adds 3 and 4 and stores the result in element 2.

What if thread #0 starts iteration 2 before thread #1 has finished iteration 1? That would mean thread #0 could read 3 instead of 7, or possibly even a torn value(?). There is no synchronization here at all, so is the code wrong?
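To make the overlap concrete, here is a small host-only sketch I put together (it is not from the tutorial; it merely replays the kernel's index arithmetic for the four threads of the <<<1, count/2>>> launch) that prints which elements each thread reads and writes on every iteration:

#include <iostream>
using namespace std;

// Host-only trace of the kernel's index arithmetic, assuming blockDim.x == 4
// (the <<<1, count/2>>> launch above). No GPU code runs here.
int main()
{
    const int blockDimX = 4;
    int iteration = 1;
    for (int tc = blockDimX, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1, ++iteration)
    {
        for (int tid = 0; tid < blockDimX; ++tid)
        {
            if (tid < tc)
            {
                int posA = tid * stepSize * 2;
                int posB = posA + stepSize;
                cout << "iteration " << iteration << ": thread " << tid
                     << " reads d[" << posA << "], d[" << posB << "]"
                     << " and writes d[" << posA << "]" << endl;
            }
        }
    }
    return 0;
}

Its output shows that in iteration 2, thread 0 reads d[2], the very element that thread 1 writes in iteration 1, which is exactly the ordering I am worried about.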
(*) Note: it's not that I'm sure there is no race condition; I'm simply putting my faith in the tutorial's code being safe and correct.
Answer 0 (score: 1)
The code is wrong: it needs a __syncthreads() call, as shown below. The barrier goes at the end of the loop body, outside the if, so that every thread in the block reaches it.
__global__ void sumSingleBlock(int* d)
{
    int tid = threadIdx.x;

    // Number of participating threads (tc) halves on each iteration
    for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
    {
        // Thread must be allowed to write
        if (tid < tc)
        {
            // We need to do A + B, where B is the element following A, so first we
            // need to find the position of element A and of element B
            int posA = tid * stepSize * 2;
            int posB = posA + stepSize;

            // Update the value at posA by adding the value at posB to it
            d[posA] += d[posB];
        }

        // Barrier: all writes of this iteration must complete before any thread
        // reads those elements in the next iteration. It sits outside the if so
        // that every thread in the block reaches it.
        __syncthreads();
    }
}
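For completeness, here is a minimal sketch of how the corrected kernel could be launched and verified; it is not part of the original answer, and the error checks, the std::vector, and the cudaDeviceSynchronize() call are additions. The host-side reference sum uses std::accumulate, which the question's <numeric> include already makes available:

#include "cuda_runtime.h"
#include <iostream>
#include <numeric>
#include <vector>

// Assumes the corrected sumSingleBlock kernel above is defined in the same file.
int main()
{
    const int count = 8;
    const int size = count * sizeof(int);

    std::vector<int> h(count);
    for (int i = 0; i < count; ++i)
        h[i] = i + 1;

    // Reference result computed on the host: 1 + 2 + ... + 8 = 36
    const int expected = std::accumulate(h.begin(), h.end(), 0);

    int* d = nullptr;
    if (cudaMalloc(&d, size) != cudaSuccess)
    {
        std::cerr << "cudaMalloc failed" << std::endl;
        return 1;
    }
    cudaMemcpy(d, h.data(), size, cudaMemcpyHostToDevice);

    // One block, one thread per pair of elements, as in the question
    sumSingleBlock<<<1, count / 2>>>(d);
    if (cudaDeviceSynchronize() != cudaSuccess)   // catches launch/execution errors
    {
        std::cerr << "kernel failed" << std::endl;
        return 1;
    }

    int result = 0;
    cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d);

    std::cout << "GPU sum = " << result << ", CPU sum = " << expected << std::endl;
    return (result == expected) ? 0 : 1;
}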