我正在尝试执行包容性扫描以查找数组的累积总和。按照harrism here给出的建议,我使用的是给定here的过程,但是按照这些作者的建议,我正在尝试编写代码,每个线程计算4个元素而不是1个元素屏蔽内存延迟。
我远离推力因为性能至关重要,我需要多流功能。我刚刚发现了CUB,这将是我的下一步努力,但我想要一个多块解决方案,并且还想知道我在现有代码上出错的地方,就像更好地理解CUDA的练习一样。
下面的代码为每个块分配4个数据元素,其中每个块必须具有32个线程的倍数。我的数据将有128个线程的倍数,所以这个限制对我来说是可以接受的。足够的共享内存分配给4*blockDim.x
元素的每个块以及另外32个元素以在warp之间求和。 scanBlockAnyLength
然后添加必要的偏移量以纠正warp之间的不匹配,将每个warp的最终值保存到设备全局内存中的dev_blockSum
。 sumWarp4_32
然后扫描此数组以查找最终更正块之间不匹配的内容,然后将其添加到kernel_sumBlock
#include<cuda.h>
#include<iostream>
using std::cout;
using std::endl;
#define MAX_THREADS 1024
#define MAX_BLOCKS 65536
#define N 512
__device__ float sumWarp4_128(float* ptr, const int tidx = threadIdx.x) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 threads per warp
unsigned int i = warpid*128+lane; //first element of block data set this thread looks at
if( lane >= 1 ) ptr[i] += ptr[i-1];
if( lane >= 2 ) ptr[i] += ptr[i-2];
if( lane >= 4 ) ptr[i] += ptr[i-4];
if( lane >= 8 ) ptr[i] += ptr[i-8];
if( lane >= 16 ) ptr[i] += ptr[i-16];
if( lane==0 ) ptr[i+32] += ptr[i+31];
if( lane >= 1 ) ptr[i+32] += ptr[i+32-1];
if( lane >= 2 ) ptr[i+32] += ptr[i+32-2];
if( lane >= 4 ) ptr[i+32] += ptr[i+32-4];
if( lane >= 8 ) ptr[i+32] += ptr[i+32-8];
if( lane >= 16 ) ptr[i+32] += ptr[i+32-16];
if( lane==0 ) ptr[i+64] += ptr[i+63];
if( lane >= 1 ) ptr[i+64] += ptr[i+64-1];
if( lane >= 2 ) ptr[i+64] += ptr[i+64-2];
if( lane >= 4 ) ptr[i+64] += ptr[i+64-4];
if( lane >= 8 ) ptr[i+64] += ptr[i+64-8];
if( lane >= 16 ) ptr[i+64] += ptr[i+64-16];
if( lane==0 ) ptr[i+96] += ptr[i+95];
if( lane >= 1 ) ptr[i+96] += ptr[i+96-1];
if( lane >= 2 ) ptr[i+96] += ptr[i+96-2];
if( lane >= 4 ) ptr[i+96] += ptr[i+96-4];
if( lane >= 8 ) ptr[i+96] += ptr[i+96-8];
if( lane >= 16 ) ptr[i+96] += ptr[i+96-16];
return ptr[i+96];
}
__host__ __device__ float sumWarp4_32(float* ptr, const int tidx = threadIdx.x) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 elements per warp
unsigned int i = warpid*32+lane; //first element of block data set this thread looks at
if( lane >= 1 ) ptr[i] += ptr[i-1];
if( lane >= 2 ) ptr[i] += ptr[i-2];
if( lane >= 4 ) ptr[i] += ptr[i-4];
if( lane >= 8 ) ptr[i] += ptr[i-8];
if( lane >= 16 ) ptr[i] += ptr[i-16];
return ptr[i];
}
__device__ float sumBlock4(float* ptr, const int tidx = threadIdx.x, const int bdimx = blockDim.x ) {
const unsigned int lane = tidx & 31;
const unsigned int warpid = tidx >> 5; //32 threads per warp
float val = sumWarp4_128(ptr);
__syncthreads();//should be included
if( tidx==bdimx-1 ) ptr[4*bdimx+warpid] = val;
__syncthreads();
if( warpid==0 ) sumWarp4_32((float*)&ptr[4*bdimx]);
__syncthreads();
if( warpid>0 ) {
ptr[warpid*128+lane] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+32] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+64] += ptr[4*bdimx+warpid-1];
ptr[warpid*128+lane+96] += ptr[4*bdimx+warpid-1];
}
__syncthreads();
return ptr[warpid*128+lane+96];
}
__device__ void scanBlockAnyLength4(float *ptr, float* dev_blockSum, const float* dev_input, float* dev_output, const int idx = threadIdx.x, const int bdimx = blockDim.x, const int bidx = blockIdx.x) {
const unsigned int lane = idx & 31;
const unsigned int warpid = idx >> 5;
ptr[lane+warpid*128] = dev_input[lane+warpid*128+bdimx*bidx*4];
ptr[lane+warpid*128+32] = dev_input[lane+warpid*128+bdimx*bidx*4+32];
ptr[lane+warpid*128+64] = dev_input[lane+warpid*128+bdimx*bidx*4+64];
ptr[lane+warpid*128+96] = dev_input[lane+warpid*128+bdimx*bidx*4+96];
__syncthreads();
float val = sumBlock4(ptr);
__syncthreads();
dev_blockSum[0] = 0.0f;
if( idx==0 ) dev_blockSum[bidx+1] = ptr[bdimx*4-1];
dev_output[lane+warpid*128+bdimx*bidx*4] = ptr[lane+warpid*128];
dev_output[lane+warpid*128+bdimx*bidx*4+32] = ptr[lane+warpid*128+32];
dev_output[lane+warpid*128+bdimx*bidx*4+64] = ptr[lane+warpid*128+64];
dev_output[lane+warpid*128+bdimx*bidx*4+96] = ptr[lane+warpid*128+96];
__syncthreads();
}
__global__ void kernel_sumBlock(float* dev_blockSum, const float* dev_input, float* dev_output ) {
extern __shared__ float ptr[];
scanBlockAnyLength4(ptr,dev_blockSum,dev_input,dev_output);
}
__global__ void kernel_offsetBlocks(float* dev_blockSum, float* dev_arr) {
const int tidx = threadIdx.x;
const int bidx = blockIdx.x;
const int bdimx = blockDim.x;
const int lane = tidx & 31;
const int warpid = tidx >> 5;
if( warpid==0 ) sumWarp4_32(dev_blockSum);
float val = dev_blockSum[warpid];
dev_arr[warpid*128+lane] += val;
dev_arr[warpid*128+lane+32] += val;
dev_arr[warpid*128+lane+64] += val;
dev_arr[warpid*128+lane+96] += val;
}
void scan4( const float input[], float output[]) {
int blocks = 2;
int threadsPerBlock = 64; //multiple of 32
int smemsize = (threadsPerBlock*4+32)*sizeof(float);
float* dev_input, *dev_output;
cudaMalloc((void**)&dev_input,blocks*threadsPerBlock*4*sizeof(float));
cudaMalloc((void**)&dev_output,blocks*threadsPerBlock*4*sizeof(float));
float *dev_blockSum;
cudaMalloc((void**)&dev_blockSum,blocks*sizeof(float));
int offset = 0;
int Nrem = N;
int chunksize;
while( Nrem ) {
chunksize = max(Nrem,blocks*threadsPerBlock*4);
cudaMemcpy(dev_input,(void**)&input[offset],chunksize*sizeof(float),cudaMemcpyHostToDevice);
kernel_sumBlock<<<blocks,threadsPerBlock,smemsize>>>(dev_blockSum,dev_input,dev_output);
kernel_offsetBlocks<<<blocks,threadsPerBlock>>>(dev_blockSum,dev_output);
cudaMemcpy((void**)&output[offset],dev_output,chunksize*sizeof(float),cudaMemcpyDeviceToHost);
offset += chunksize;
Nrem -= chunksize;
}
cudaFree(dev_input);
cudaFree(dev_output);
}
int main() {
float h_vec[N], sol[N];
for( int i = 0; i < N; i++ ) h_vec[i] = (float)i+1.0f;
scan4(h_vec,sol);
cout << "solution:" << endl;
for( int i = 0; i < N; i++ ) cout << i << " " << (i+2)*(i+1)/2 << " " << sol[i] << endl;
return 0;
}
在我看来,代码会抛出错误,因为sumWarp4_128
中的行不会在warp中按顺序执行。即,if( lane==0 )
行在其前面的其他逻辑块之前执行。我认为这在经线中是不可能的。
如果我在__syncthreads()
来电之前和之后lane==0
,我会得到一些新奇特的错误,我无法弄明白。
任何帮助指出我出错的地方都将不胜感激
答案 0 :(得分:2)
由于不共享数据的线程之间的同步,您编写的代码具有竞争条件。虽然这可以在当前硬件上进行,以便在warp内进行通信(所谓的warp-synchronous编程),但是非常不鼓励它,因为代码中的竞争条件可能导致它在未来可能的硬件上失败。
虽然通过每个线程处理多个项目可以获得更高的性能,但4不是一个神奇的数字 - 如果可能的话,你应该把它作为一个可调参数。例如,CUDPP每个线程使用8个。
我强烈建议您使用CUB。您应该使用cub::BlockLoad()
为每个帖子加载多个项目,并使用cub::BlockScan()
来扫描它们。那么你只需要一些代码来组合多个块。获得带宽效率最高的方法是使用Thrust使用的“Reduce-Scan-Scan”方法。首先减少每个块(cub :: BlockReduce)并将每个块的总和存储到blockSums
数组。然后扫描该数组以获得每块偏移量。然后在块上执行cub :: BlockScan,并将先前计算的每块偏移量添加到每个元素。