I have a CUDA kernel in which each block must wait for every previous block to finish before it may proceed. I implemented this synchronization with two atomic counters, like this:
#include <thrust/device_vector.h>

__global__
static void waitTest(int* counters)
{
    __shared__ int orderedBlockId;
    int tid = threadIdx.x;
    if(tid == 0){
        // take a ticket that defines this block's position in the global order
        orderedBlockId = atomicAdd(&counters[0], 1);
        // wait on the previous group of 16 blocks
        int expectedCounter = orderedBlockId / 16 * 16;
        while(atomicAdd(&counters[1], 0) < expectedCounter){
            // spin
        }
    }
    __syncthreads();
    // do something
    __syncthreads();
    if(tid == 0){
        // signal that this block has finished
        atomicAdd(&counters[1], 1);
    }
}

int main(){
    thrust::device_vector<int> counters(2, 0);
    waitTest<<<128, 128>>>(thrust::raw_pointer_cast(counters.data()));
    cudaDeviceSynchronize();
}
My question: is there a cheaper way to force a global memory read than atomicAdd(&counters[1], 0)? Replacing it with a plain counters[1] read makes the kernel time out.
Answer (score: 1)
As Robert mentioned in the comments, counters must be declared volatile. To pass them to atomicAdd, the pointers have to be cast back to the plain (non-volatile) type. Code:
__global__
static void waitTest(volatile int* counters)
{
    __shared__ int orderedBlockId;
    int tid = threadIdx.x;
    if(tid == 0){
        // cast away volatile: atomicAdd expects a plain int*
        orderedBlockId = atomicAdd((int*)&counters[0], 1);
        // wait on the previous group of 16 blocks
        int expectedCounter = orderedBlockId / 16 * 16;
        // the volatile qualifier forces a fresh global-memory load on every iteration
        while(counters[1] < expectedCounter){
            // spin
        }
    }
    __syncthreads();
    // do something
    __syncthreads();
    if(tid == 0){
        // signal that this block has finished
        atomicAdd((int*)&counters[1], 1);
    }
}
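
For completeness, a minimal host-side sketch of how the fixed kernel could be launched, assuming the same Thrust setup as in the question and that the kernel above is in the same translation unit. The raw int* returned by raw_pointer_cast converts implicitly to the kernel's volatile int* parameter, so the launch itself does not change:

#include <thrust/device_vector.h>

int main(){
    thrust::device_vector<int> counters(2, 0);
    // int* from raw_pointer_cast converts implicitly to volatile int*
    waitTest<<<128, 128>>>(thrust::raw_pointer_cast(counters.data()));
    cudaDeviceSynchronize();
}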