I have a CUDA kernel in which each block must wait for every previous block to finish before it may proceed. I implemented this synchronization with two atomic counters, like this:
#include <thrust/device_vector.h>

__global__
static void waitTest(int* counters)
{
    __shared__ int orderedBlockId;
    int tid = threadIdx.x;
    if(tid == 0){
        // take a ticket that defines this block's position in the global order
        orderedBlockId = atomicAdd(&counters[0], 1);
        // wait on the previous group of 16 blocks
        int expectedCounter = orderedBlockId / 16 * 16;
        while(atomicAdd(&counters[1], 0) < expectedCounter){
            // spin
        }
    }
    __syncthreads();
    // do something
    __syncthreads();
    if(tid == 0){
        // signal that this block has finished
        atomicAdd(&counters[1], 1);
    }
}

int main(){
    thrust::device_vector<int> counters(2, 0);
    waitTest<<<128, 128>>>(thrust::raw_pointer_cast(counters.data()));
    cudaDeviceSynchronize();
}
My question: is there a cheaper way to force a global memory read than atomicAdd(&counters[1], 0)? Replacing it with a plain counters[1] read makes the kernel time out.
Answer (score: 1)
As Robert mentioned in the comments, counters must be declared volatile. To pass them to atomicAdd, the pointers have to be cast back to the plain (non-volatile) type. Code:
__global__
static void waitTest(volatile int* counters)
{
    __shared__ int orderedBlockId;
    int tid = threadIdx.x;
    if(tid == 0){
        // cast away volatile: atomicAdd expects a plain int*
        orderedBlockId = atomicAdd((int*)&counters[0], 1);
        // wait on the previous group of 16 blocks
        int expectedCounter = orderedBlockId / 16 * 16;
        // the volatile qualifier forces a fresh global-memory load on every iteration
        while(counters[1] < expectedCounter){
            // spin
        }
    }
    __syncthreads();
    // do something
    __syncthreads();
    if(tid == 0){
        // signal that this block has finished
        atomicAdd((int*)&counters[1], 1);
    }
}
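
For completeness, a minimal host-side sketch of how the fixed kernel could be launched, assuming the same Thrust setup as in the question and that the kernel above is in the same translation unit. The raw int* returned by raw_pointer_cast converts implicitly to the kernel's volatile int* parameter, so the launch itself does not change:

#include <thrust/device_vector.h>

int main(){
    thrust::device_vector<int> counters(2, 0);
    // int* from raw_pointer_cast converts implicitly to volatile int*
    waitTest<<<128, 128>>>(thrust::raw_pointer_cast(counters.data()));
    cudaDeviceSynchronize();
}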