Summed Scan - wrong results if the thread count is large

Date: 2018-01-05 11:24:54

Tags: cuda

Inspired by the sum-scan algorithm implementation described here, I tried to implement it as follows:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];

    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

        if ((s - threadIdx.x) == 0)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
            }
        }
        __syncthreads();
        if ((s - threadIdx.x) == 0)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 1024; // FAULTY case by default to answer request within comments
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ; 
        else values[k] = 0;
    }

    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}

When running it with 128 threads per block, the results are fine; with more than 128 threads per block, however, the results in the shared array are wrong beyond the 128th entry.

Nothing in the code seems to account for a threshold at 128, yet there appears to be one.

ptxas info from compilation:

1>  ptxas info    : Compiling entry function '_Z18count_zeros_sharediiPi' for 'sm_61' 
1>  ptxas info    : Function properties for _Z18count_zeros_sharediiPi 
1>      8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
1>  ptxas info    : Used 25 registers, 8 bytes cumulative stack size, 4096 bytes smem, 336 bytes cmem[0]

Executed on a GeForce 1060M, compiled with CUDA 9.0 targeting cc 6.1, in debug mode, on Windows 10.

The __syncthreads() routine is called in the appropriate places; however, its behavior may have changed since CUDA 9.0. Am I missing something here?

Additional information

Compiling on Linux:

$> nvcc -G main.cu -arch sm_61

Running on a GeForce GTX 1080 shows a similar problem.

However,

$> nvcc main.cu -arch sm_61

the results look fine.

2 answers:

Answer 0 (score: 1)

There is a race condition here:

    for (int shift = 1; shift < blockDim.x; shift += shift)
    {
        __syncthreads();
        if (threadIdx.x >= shift)
        {
            s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
            ^^^^^^                   ^^^^^^
            write op                 read op

If we consider the shift = 1 case, threads 1, 2, 3, 4, ... will all participate. Thread 1 wants to read location 0 and add it to location 1. Thread 2 wants to read location 1 and add it to location 2. The order of these operations (and likewise across the rest of the thread block) will affect the results.
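
To make the ordering dependence concrete, here is a minimal host-side sketch in plain C++ (my illustration, not part of the original code; the array contents are arbitrary) that replays one shift = 1 step of the in-place update under two possible interleavings:

#include <cstdio>

int main()
{
    const int n = 4;
    int a[n] = {1, 1, 1, 1};

    // Interleaving A: every "thread" reads its left neighbour before any
    // thread writes. This is the intended step and yields {1, 2, 2, 2}.
    int readA[n] = {0};
    for (int i = 1; i < n; ++i) readA[i] = a[i - 1];        // all reads happen first
    int resA[n] = {a[0]};
    for (int i = 1; i < n; ++i) resA[i] = a[i] + readA[i];  // then all writes

    // Interleaving B: thread i reads location i-1 *after* thread i-1 has
    // already written its sum back. The same step now yields {1, 2, 3, 4}.
    int resB[n] = {1, 1, 1, 1};
    for (int i = 1; i < n; ++i) resB[i] += resB[i - 1];     // read and write interleaved

    for (int i = 0; i < n; ++i)
        printf("i=%d  reads-first=%d  interleaved=%d\n", i, resA[i], resB[i]);
    return 0;
}

Which of the two results the racy kernel actually produces depends on scheduling, so the output can change from run to run and with block size.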

This is discoverable by running the code with cuda-memcheck --tool racecheck ..., although it apparently depends on not compiling with -G (see below).

I assume you are trying to implement the stride-doubling scan pattern indicated in Figure 39-2 here. In that case, I am not aware of a trivial modification to your code that would fix this while keeping the scan in place and still avoiding the race condition. If we scan out of place instead, we can avoid the race condition. The following example demonstrates this:

$ cat t271.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
    __shared__ unsigned s_offset2[1024];

    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
            __syncthreads();
        }
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$

I'm sure it is not "optimal", but then this whole exercise is not "optimal". If you want a fast scan, use thrust or cub.
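
For reference, here is a minimal sketch of what that looks like with thrust (my illustration, not part of the original answer); thrust::inclusive_scan can run in place on a device_vector:

#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <cstdio>

int main()
{
    // eight per-thread counts of 1, standing in for the s_offset values
    thrust::device_vector<unsigned> counts(8, 1);

    // inclusive prefix sum, in place, entirely on the device
    thrust::inclusive_scan(counts.begin(), counts.end(), counts.begin());

    unsigned total = counts.back();   // last element holds the grand total
    printf("total = %u\n", total);    // prints: total = 8
    return 0;
}

Built with nvcc, this replaces the hand-rolled shared-memory loop with a library scan that is already tuned and race-free.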

To answer the question raised in the comments: I agree that compiling with or without the -G switch (device debug code generation) appears to affect whether cuda-memcheck --tool racecheck ... reports a hazard. On Linux with CUDA 9.0 and a GTX 960 (cc 5.2), I ran the following test case, a minor variation of the originally supplied code:

$ cat t271.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
#ifdef FIX
    __shared__ unsigned s_offset2[1024];
#endif
    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
#ifndef FIX
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];  //line 34
            }
        }
        __syncthreads();
#else
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
            __syncthreads();
        }
#endif
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -G -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$ nvcc -lineinfo -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [236072 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1992 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [2369 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [232728 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [913 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [233479 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1841 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [239007 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1833 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [228636 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1689 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [225456 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [2177 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [151696 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1009 hazards]
=========
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [8064 hazards]
=========
========= RACECHECK SUMMARY: 16 hazards displayed (16 errors, 0 warnings)
$

We see that when compiled with -G, cuda-memcheck reports no race conditions; without that switch, the races are reported. I don't have an explanation for this at the moment. I see two possibilities:

  1. a bug in cuda-memcheck
  2. some characteristic of the code generated with -G that somehow completely masks the race condition from the tool, though I'm really not sure what that would be.

Answer 1 (score: 0)

Thanks to @RobertCrovella's answer, I changed the code to avoid the race condition (added here for completeness):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void init_kernel(unsigned* totalcount)
{
        totalcount [threadIdx.x] = 0;
}

#define TRACE 0

__global__ void count_zeros_shared(int N, int M, int* data, unsigned* totalcount)
{
        __shared__ unsigned s_offset[1024];

        for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
        {
                // count non-zero values
                unsigned count = 0;
                for (int j = 0; j < M; ++j)
                {
                        if (data[s + N * j] != 0)
                                ++count;
                }
                s_offset[threadIdx.x] = count;

                #if TRACE
                if ((s - threadIdx.x) == 0)
                        printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
                #endif

                // reduce offset
                for (int shift = 1; shift < blockDim.x; shift += shift)
                {
                        __syncthreads();

                        #if 0 // race condition version

                        if (threadIdx.x >= shift)
                        {
                                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
                        }

                        #else
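                        // race-free variant: stage the reads in a register,
                        // synchronize the block, then write back to shared memory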

                        int val = s_offset[threadIdx.x];
                        if (threadIdx.x >= shift)
                        {
                                val += s_offset[threadIdx.x - shift] ;
                        }
                        __syncthreads();
                        if (threadIdx.x >= shift)
                        {
                                s_offset[threadIdx.x] = val ;
                        }

                        #endif
                }
                __syncthreads();

                #if TRACE
                if ((s - threadIdx.x) == 0)
                        printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
                #endif

                if (threadIdx.x == 0)
                        atomicAdd(totalcount, s_offset[blockDim.x - 1]);

                __syncthreads();
        }

}

__global__ void printsum_kernel(int N, unsigned* totalcount)
{
        for (int NTH = 32 ; NTH <= 1024 ; NTH *= 2)
                printf("GPU TOTAL COUNT [BlockDIM = %d] = %d\n", NTH, totalcount[(NTH / 32) - 1]);
}


#include <cstdlib>

int main(int argc, char* argv[])
{
        cudaError_t cuerr;

        int* values;
        unsigned* totalcount;
        int N = 1024 * 48, M = 448;

        cuerr = ::cudaMalloc(&totalcount, (1024/32) * sizeof(unsigned)) ;
        if (cuerr != cudaSuccess) return cuerr;

        cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
        if (cuerr != cudaSuccess) return cuerr;

        int count = 0;

        ::srand(42);

        for (int k = 0; k < N*M; ++k)
        {
                if ((rand() % 7) == 0) values[k] = ++count ;
                else values[k] = 0;
        }

        init_kernel <<< 1, 1024/32 >>> (totalcount);
        cuerr = ::cudaDeviceSynchronize();
        if (cuerr != cudaSuccess) return cuerr;

        for (int NTH = 32; NTH <= 1024; NTH *= 2)
        {
                printf("RUNNING %d threads per block\n", NTH);

                count_zeros_shared <<< N / NTH, NTH >>> (N, M, values, totalcount + ((NTH / 32) - 1));
                cuerr = ::cudaDeviceSynchronize();
                if (cuerr != cudaSuccess) return cuerr;
        }

        printsum_kernel <<< 1, 1 >>> (1024/32, totalcount);
        cuerr = ::cudaDeviceSynchronize();
        if (cuerr != cudaSuccess) return cuerr;

        printf("GROUND TRUTH TOTAL COUNT = %d\n", count);

        return ::cudaDeviceReset();
}

However, the cuda-memcheck tool behaves misleadingly here:

$> nvcc -G main3.cu -arch sm_61 -o a3.out
$>  cuda-memcheck --version
CUDA-MEMCHECK version 9.0.176 ID:(44)
$> cuda-memcheck --tool racecheck ./a3.out
========= CUDA-MEMCHECK
RUNNING 32 threads per block
RUNNING 64 threads per block
RUNNING 128 threads per block
RUNNING 256 threads per block
RUNNING 512 threads per block
RUNNING 1024 threads per block
GPU TOTAL COUNT [BlockDIM = 32] = 3145571
GPU TOTAL COUNT [BlockDIM = 64] = 3145571
GPU TOTAL COUNT [BlockDIM = 128] = 3145571
GPU TOTAL COUNT [BlockDIM = 256] = 3145571
GPU TOTAL COUNT [BlockDIM = 512] = 3145571
GPU TOTAL COUNT [BlockDIM = 1024] = 3145571
GROUND TRUTH TOTAL COUNT = 3145571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$> ./a3.out
RUNNING 32 threads per block
RUNNING 64 threads per block
RUNNING 128 threads per block
RUNNING 256 threads per block
RUNNING 512 threads per block
RUNNING 1024 threads per block
GPU TOTAL COUNT [BlockDIM = 32] = 3145571
GPU TOTAL COUNT [BlockDIM = 64] = 3145571
GPU TOTAL COUNT [BlockDIM = 128] = 3161857
GPU TOTAL COUNT [BlockDIM = 256] = 3200816
GPU TOTAL COUNT [BlockDIM = 512] = 3231303
GPU TOTAL COUNT [BlockDIM = 1024] = 3925122
GROUND TRUTH TOTAL COUNT = 3145571

$> nvidia-smi
Fri Jan  5 18:29:33 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.98                 Driver Version: 384.98                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 1080    Off  | 00000000:02:00.0 Off |                  N/A |
| 29%   44C    P0    39W / 180W |      0MiB /  8112MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+

Running under cuda-memcheck actually hides the problem.

The race-free version of the code works correctly, at the cost of extra __syncthreads() calls (but no extra shared memory).
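
One last note on that trade-off: within a single warp, the same doubling-stride step can be written with CUDA 9's __shfl_up_sync, which exchanges registers directly and therefore needs neither shared memory nor __syncthreads(). This is only a sketch of the warp-level building block (the function name is mine), not a drop-in replacement for the block-wide scan above:

__device__ unsigned warp_inclusive_scan(unsigned val)
{
    // each lane pulls the running sum from `shift` lanes below it;
    // __shfl_up_sync exchanges registers, so no shared-memory race is possible
    for (int shift = 1; shift < 32; shift <<= 1)
    {
        unsigned lower = __shfl_up_sync(0xffffffff, val, shift);
        if ((threadIdx.x & 31) >= shift)
            val += lower;
    }
    return val;
}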