Question

在条件代码中允许使用 __syncthreads()，但前提是该条件在整个线程块中的求值结果完全相同；否则代码执行可能会挂起或产生意外的副作用。

我尝试通过以下代码使内核挂起：

#include <stdio.h>

// Kernel from the question: an attempt to provoke a hang by placing
// __syncthreads() in both arms of a branch that splits the block at
// thread index `warpSize`.
//
// warpSize: thread-index threshold that decides which arm a thread takes
//           (the host code passes the device's warp size).
//
// When the block has more than `warpSize` threads the branch condition is
// not uniform across the block, which is exactly the situation the quoted
// documentation says may hang or produce unintended side effects.
// NOTE(review): the parameter shares its name with CUDA's built-in
// `warpSize` variable — confirm the shadowing is intentional.
__global__ void test(int warpSize)
{
    int i = threadIdx.x;
    if (i < warpSize) {
        // Barrier call site reached only by threads with index < warpSize.
        __syncthreads();
    }
    else {
        // Separate barrier call site reached only by the remaining threads.
        __syncthreads();
    }
}

// Host driver for the question's experiment: launches `test` on a single
// block that is two warps wide, then prints "done" without waiting for the
// kernel to finish.
int main(int argc,char **argv)
{
    // Look up the properties of the currently selected device.
    int currentDevice;
    cudaGetDevice(&currentDevice);

    cudaDeviceProp deviceProps;
    cudaGetDeviceProperties(&deviceProps, currentDevice);

    // One block, twice the warp size in threads, so the kernel's branch
    // threshold falls in the middle of the block.
    const int threadsPerWarp = deviceProps.warpSize;
    const int threadsPerBlock = 2 * threadsPerWarp;
    test<<<1, threadsPerBlock>>>(threadsPerWarp);

    // No synchronization follows the launch, so this line does not wait
    // for the kernel.
    printf("done");
    return 0;
}

但该程序正常退出。

据我了解，内核中存在两个屏障。if 块中的屏障将等待 warp #1 的完成，而 else 块中的屏障将等待 warp #0 的完成。是我误解了 __syncthreads() 吗？还是说条件代码中的 __syncthreads() 总会被执行，即使它位于“非活动”的执行路径中？

Answer 1

根据评论，代码应该更复杂一些，以免编译器把内核优化掉。此外，如果没有同步，CPU 线程就不会被挂起的内核阻塞。

修改后的代码：

#include <stdio.h>

// Kernel from the answer: same divergent-barrier experiment as the question,
// but with shared-memory traffic and a global store so that the kernel has
// observable effects.
//
// warpSize: thread-index threshold that decides which arm a thread takes
//           (the host code passes the device's warp size).
// d_dummy:  output pointer; element 0 receives the final value of `tmp`.
//
// NOTE(review): the parameter shares its name with CUDA's built-in
// `warpSize` variable — confirm the shadowing is intentional.
__global__ void test(int warpSize, int *d_dummy)
{
    int i = threadIdx.x;
    // One counter shared by every thread of the block.
    __shared__ int tmp;
    // Every thread stores 0; the barrier below orders that store before
    // the updates in the branch.
    tmp = 0;
    __syncthreads();

    // The updates to `tmp` below are plain (non-atomic) read-modify-writes
    // issued by many threads, so the resulting value is not well-defined.
    if (i < warpSize) {
        tmp += 1;
        // Barrier call site reached only by threads with index < warpSize.
        __syncthreads();
        tmp += 2;
    }
    else {
        tmp -= 3;
        // Separate barrier call site reached only by the remaining threads.
        __syncthreads();
        tmp -= 4;
    }
    // Barrier reached by all threads again before publishing the result.
    __syncthreads();
    // Every thread writes the same location.
    d_dummy[0] = tmp;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Host driver for the answer's experiment: launches `test` on a single block
// that is two warps wide, waits for it, and prints the value the kernel wrote.
//
// Returns 0 on success and 1 if any CUDA runtime call reports an error.
int main(int argc,char **argv)
{
    int device;
    cudaDeviceProp prop;
    if (!cudaOk(cudaGetDevice(&device), "cudaGetDevice") ||
        !cudaOk(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties")) {
        return 1;
    }

    // Initialise the host copy so a failed transfer never prints garbage.
    int h_dummy[1] = {0};
    int *d_dummy = NULL;
    if (!cudaOk(cudaMalloc(&d_dummy, sizeof(h_dummy)), "cudaMalloc")) {
        return 1;
    }

    // One block, twice the warp size in threads, so the kernel's branch
    // threshold falls in the middle of the block.
    test<<<1, 2 * prop.warpSize>>>(prop.warpSize, d_dummy);

    // A launch does not return an error itself: query the launch status,
    // then synchronize so that a hung or faulting kernel shows up here,
    // before the result is copied back. Evaluation stops at the first failure.
    const bool ok =
        cudaOk(cudaGetLastError(), "kernel launch") &&
        cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
        cudaOk(cudaMemcpy(h_dummy, d_dummy, sizeof(h_dummy), cudaMemcpyDeviceToHost),
               "cudaMemcpy");

    // Release the device buffer on both the success and the failure path.
    const bool freed = cudaOk(cudaFree(d_dummy), "cudaFree");
    if (!ok || !freed) {
        return 1;
    }

    printf("done %d", h_dummy[0]);
    return 0;
}

但是，当块内的warp不在同一执行路径上时，__syncthreads()的行为未定义。所以我们不能指望程序挂起。

条件代码中的 __syncthreads() 是否始终运行，即使它位于“非活动”执行路径中？

1 个答案: