Question

我想在CUDA上实现Inter-block障碍，但遇到了严重的问题。

我无法弄清楚为什么它不起作用。

#include <iostream>
#include <cstdlib>
#include <ctime>

#define SIZE 10000000
#define BLOCKS 100 

using namespace std;

struct Barrier {
    int *count;

    __device__ void wait() {
        atomicSub(count, 1);
        while(*count)
            ;
    }

    Barrier() {
        int blocks = BLOCKS;
        cudaMalloc((void**) &count, sizeof(int));
        cudaMemcpy(count, &blocks, sizeof(int), cudaMemcpyHostToDevice);
    }

    ~Barrier() {
        cudaFree(count);
    }
};


__global__ void sum(int* vec, int* cache, int *sum, Barrier barrier)
{
    int tid = blockIdx.x;

    int temp = 0;
    while(tid < SIZE) {
        temp += vec[tid];
        tid += gridDim.x;
    }

    cache[blockIdx.x] = temp;

    barrier.wait();

    if(blockIdx.x == 0) {
        for(int i = 0 ; i < BLOCKS; ++i)
            *sum += cache[i];
    }
}

int main()
{
    int* vec_host = (int *) malloc(SIZE * sizeof(int));    
    for(int i = 0; i < SIZE; ++i)
        vec_host[i] = 1;

    int *vec_dev;
    int *sum_dev;
    int *cache;
    int sum_gpu = 0;

    cudaMalloc((void**) &vec_dev, SIZE * sizeof(int));
    cudaMemcpy(vec_dev, vec_host, SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**) &sum_dev, sizeof(int));
    cudaMemcpy(sum_dev, &sum_gpu, sizeof(int), cudaMemcpyHostToDevice);
    cudaMalloc((void**) &cache, BLOCKS * sizeof(int));
    cudaMemset(cache, 0, BLOCKS * sizeof(int));

    Barrier barrier;
    sum<<<BLOCKS, 1>>>(vec_dev, cache, sum_dev, barrier);

    cudaMemcpy(&sum_gpu, sum_dev, sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(vec_dev);
    cudaFree(sum_dev);
    cudaFree(cache);
    free(vec_host);
    return 0;
}

实际上，即使我将wait（）重写为以下

    __device__ void wait() {
        while(*count != 234124)
            ;
    }

程序正常退出。但在这种情况下，我希望得到一个无限循环。

Answer 1

不幸的是，在CUDA中，您想要实现的目标（块间通信/同步）并不严格。 CUDA编程指南指出“线程块需要独立执行：必须能够以任何顺序，并行或串行执行它们。”这种限制的原因是允许线程块调度程序具有灵活性，并允许代码在核心数量上进行扩展。唯一支持的块间同步方法是启动另一个内核：内核启动（在同一个流中）是隐式同步点。

您的代码违反了块独立性规则，因为它隐含地假设您的内核的线程块并发执行（参见并行）。但不能保证他们这样做。要了解为什么这对您的代码很重要，让我们考虑一个只有一个核心的假设GPU。我们还假设您只想启动两个线程块。在这种情况下，你的spinloop内核实际上会死锁。如果首先在核心上调度线程块零，它将在到达障碍时永远循环，因为线程块1永远不会有机会更新计数器。因为线程块零永远不会被换出（线程块执行到它们的完成），所以当它旋转时它会使线程块占据核心块之一。

有些人尝试过像你这样的方案并且已经看到了成功，因为调度程序碰巧偶然安排了阻塞，以便假设得以实现。例如，有一段时间启动尽可能多的线程块，因为GPU具有SM意味着这些块是真正同时执行的。但是当对驱动程序或CUDA运行时或GPU的更改使该假设无效时，他们感到失望，打破了他们的代码。

对于您的应用程序，尝试找到一个不依赖于块间同步的解决方案，因为（除非对CUDA编程模型有意义的改变），这是不可能的。

Answer 2

可以阻止块同步。见paper 本文没有详细介绍它是如何工作的，但它依赖于__syncthreads（）的操作;为当前块创建暂停屏障，...等待其他块到达同步点。

本文未提及的一个项目是，只有当块数足够小或者SM的数量足够用于手头的任务时才能进行同步。即如果你有4个SM并且正在尝试同步5个块，那么内核将会死锁。

通过他们的方法，我已经能够在许多块中传播一个长串行任务，在单块方法中轻松节省30％的时间。即块同步对我有用。

Answer 3

看起来编译器优化问题。我不擅长阅读PTX代码，但看起来编译器完全忽略了while - 循环（即使用-O0编译时）：

.loc    3   41  0
cvt.u64.u32     %rd7, %ctaid.x; // Save blockIdx.x to rd7
ld.param.u64    %rd8, [__cudaparm__Z3sumPiS_S_7Barrier_cache];
mov.s32     %r8, %ctaid.x; // Now calculate ouput address
mul.wide.u32    %rd9, %r8, 4;
add.u64     %rd10, %rd8, %rd9;
st.global.s32   [%rd10+0], %r5; // Store result to cache[blockIdx.x]
.loc    17  128 0
ld.param.u64    %rd11, [__cudaparm__Z3sumPiS_S_7Barrier_barrier+0]; // Get *count to rd11
mov.s32     %r9, -1; // put -1 to r9
atom.global.add.s32     %r10, [%rd11], %r9; // Do AtomicSub, storing the result to r10 (will be unused)
cvt.u32.u64     %r11, %rd7; // Put blockIdx.x saved in rd7 to r11
mov.u32     %r12, 0; // Put 0 to r12
setp.ne.u32     %p3, %r11, %r12; // if(blockIdx.x == 0)
@%p3 bra    $Lt_0_5122;
ld.param.u64    %rd12, [__cudaparm__Z3sumPiS_S_7Barrier_sum];
ld.global.s32   %r13, [%rd12+0];
mov.s64     %rd13, %rd8;
mov.s32     %r14, 0;

在CPU代码的情况下，通过使用volatile前缀声明变量来防止此类行为。但即使我们将count声明为int __device__ count（并适当更改代码），添加volatile说明符只会中断编译（错误可能会argument of type "volatile int *" is incompatible with parameter of type "void *"）

我建议从CUDA SDK查看threadFenceReduction示例。在那里它们与你的工作几乎完全相同，但是在运行时选择进行最终求和的块，而不是预定义，并且while - 循环被消除，因为全局变量的自旋锁应该< em>非常慢。

CUDA上的块间障碍

3 个答案: