Summed Scan - 如果线程数量很大,则结果错误

时间:2018-01-05 11:24:54

标签: cuda


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// For every column index s handled by this thread, counts how many of the M
// entries data[s + N * j] (j = 0 .. M-1) are non-zero, stores the count in
// shared memory, and then accumulates the counts of the block in place so
// that slot t is meant to end up holding the sum of slots 0 .. t.
//
// Launch layout : 1-D grid of 1-D blocks, blockDim.x <= 1024 (size of s_offset).
// Shared memory : 4096 bytes static (1024 unsigned).
// Precondition  : N must be a multiple of blockDim.x so that all threads of a
//                 block execute the same number of outer iterations and reach
//                 every __syncthreads().
__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];

    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

        // Only block 0, first outer iteration: s == threadIdx.x.
        if ((s - threadIdx.x) == 0)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            // NOTE: this is the in-place update under discussion. Within one
            // step, thread t reads s_offset[t - shift] while thread t - shift
            // may be writing that very slot, and no barrier separates the
            // read from the write, so the result depends on thread ordering.
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
            }
        }
        __syncthreads();
        if ((s - threadIdx.x) == 0)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

// Fills an N x M managed buffer with pseudo-random zero / non-zero values and
// launches count_zeros_shared with NTH threads per block. argv[1], when
// present, overrides the default NTH.
// Returns the first failing cudaError_t as the exit code, otherwise the
// result of cudaDeviceReset().
int main(int argc, char* argv[])
{
    int NTH = 1024; // FAULTY case by default to answer request within comments
    if (argc > 1) NTH = ::atoi(argv[1]);

    // NTH is used as a divisor below and must fit the kernel's 1024-slot
    // shared array; reject anything else before it reaches the launch.
    if (NTH <= 0 || NTH > 1024)
    {
        fprintf(stderr, "threads per block must be in [1, 1024], got %d\n", NTH);
        return cudaErrorInvalidValue;
    }

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    // Roughly one element in seven becomes non-zero (a running counter value).
    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    // Launch-configuration errors are only visible through cudaGetLastError().
    cuerr = ::cudaGetLastError();
    if (cuerr != cudaSuccess) return cuerr;
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}




1>  ptxas info    : Compiling entry function '_Z18count_zeros_sharediiPi' for 'sm_61' 
1>  ptxas info    : Function properties for _Z18count_zeros_sharediiPi 
1>      8 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads 
1>  ptxas info    : Used 25 registers, 8 bytes cumulative stack size, 4096 bytes smem, 336 bytes cmem[0]

在GeForce 1060M上执行,使用CUDA 9.0在Windows 10上以调试(Debug)配置、针对cc 6.1编译。

在适当的位置调用__syncthreads()例程,但是,自CUDA 9.0以来,其行为可能已更改。我在这里错过了什么吗?



$> nvcc -G -arch sm_61

在GeForce GTX 1080上运行结果是类似的问题


$> nvcc -arch sm_61


2 个答案:

答案 0 :(得分:1)


    for (int shift = 1; shift < blockDim.x; shift += shift)
        if (threadIdx.x >= shift)
            s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
            ^^^^^^                   ^^^^^^
            write op                 read op

如果我们考虑shift = 1的情况,那么线程1,2,3,4 ......将全部参与。线程1想要读取位置0,并将其加到位置1;线程2想要读取位置1,并将其加到位置2。这些操作的先后顺序(线程块中其余线程的操作同理)将影响结果。

使用cuda-memcheck --tool racecheck ...运行代码可以发现这一竞争条件,不过这显然取决于编译时不使用-G(见下文)。

我假设您正在尝试实现图39-2 here中所示的逐步扫描模式。在这种情况下,我想不出一种对您的代码的简单修改,既能保持就地(in-place)扫描,又能避免竞争条件。如果改为异地(out-of-place)扫描,就可以避免竞争条件。以下示例演示了这一点:

$ cat
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// For every column index s handled by this thread, counts how many of the M
// entries data[s + N * j] (j = 0 .. M-1) are non-zero, then accumulates the
// per-thread counts of the block so that slot t holds the sum of slots 0 .. t.
// Each accumulation step reads from a snapshot (s_offset2) instead of the
// array being written, which removes the read/write hazard of the in-place
// version.
//
// Launch layout : 1-D grid of 1-D blocks, blockDim.x <= 1024 (array size).
// Shared memory : 8192 bytes static (two arrays of 1024 unsigned).
// Precondition  : N must be a multiple of blockDim.x so that all threads of a
//                 block execute the same number of outer iterations and reach
//                 every __syncthreads().
__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
    __shared__ unsigned s_offset2[1024];

    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            // Barrier 1: all reads of s_offset2 from the previous step are
            // done before the snapshot is overwritten.
            __syncthreads();
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            // Barrier 2: the snapshot is complete before any thread reads a
            // neighbour's slot from it.
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
        }
        __syncthreads();
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

// Fills an N x M managed buffer with pseudo-random zero / non-zero values,
// prints how many non-zero values were written, and launches
// count_zeros_shared with NTH threads per block. argv[1], when present,
// overrides the default NTH.
// Returns the first failing cudaError_t as the exit code, otherwise the
// result of cudaDeviceReset().
int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    // NTH is used as a divisor below and must fit the kernel's 1024-slot
    // shared arrays; reject anything else before it reaches the launch.
    if (NTH <= 0 || NTH > 1024)
    {
        fprintf(stderr, "threads per block must be in [1, 1024], got %d\n", NTH);
        return cudaErrorInvalidValue;
    }

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    // Roughly one element in seven becomes non-zero (a running counter value).
    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    // Launch-configuration errors are only visible through cudaGetLastError().
    cuerr = ::cudaGetLastError();
    if (cuerr != cudaSuccess) return cuerr;
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -arch=sm_52 -o t271
$ cuda-memcheck ./t271 1024
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck --tool racecheck ./t271 1024
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)


回答评论中的问题:我同意,编译时是否使用-G(设备调试代码生成)开关,似乎确实会影响cuda-memcheck --tool racecheck ...是否报告危险(hazard)。在Linux CUDA 9.0,GTX960(cc5.2)上,我使用原始代码的一个小改动版本构造了以下测试用例:

$ cat
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// For every column index s handled by this thread, counts how many of the M
// entries data[s + N * j] (j = 0 .. M-1) are non-zero, then accumulates the
// per-thread counts of the block so that slot t is meant to hold the sum of
// slots 0 .. t.
//
// Compile-time switch:
//   FIX undefined : in-place accumulation (has a read/write hazard on s_offset).
//   FIX defined   : accumulation reads from a snapshot in s_offset2 instead.
//
// Launch layout : 1-D grid of 1-D blocks, blockDim.x <= 1024 (array size).
// Precondition  : N must be a multiple of blockDim.x so that all threads of a
//                 block execute the same number of outer iterations and reach
//                 every __syncthreads().
__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
#ifdef FIX
    __shared__ unsigned s_offset2[1024];
#endif
    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
#ifndef FIX
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            // Hazard: another thread may be writing s_offset[threadIdx.x - shift]
            // during this same step.
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];  //line 34
            }
        }
#else
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            // Previous step's reads of s_offset2 finish before it is overwritten.
            __syncthreads();
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            // Snapshot is complete before neighbours' slots are read from it.
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
        }
#endif
        __syncthreads();
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

// Fills an N x M managed buffer with pseudo-random zero / non-zero values,
// prints how many non-zero values were written, and launches
// count_zeros_shared with NTH threads per block. argv[1], when present,
// overrides the default NTH.
// Returns the first failing cudaError_t as the exit code, otherwise the
// result of cudaDeviceReset().
int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    // NTH is used as a divisor below and must fit the kernel's 1024-slot
    // shared arrays; reject anything else before it reaches the launch.
    if (NTH <= 0 || NTH > 1024)
    {
        fprintf(stderr, "threads per block must be in [1, 1024], got %d\n", NTH);
        return cudaErrorInvalidValue;
    }

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    // Roughly one element in seven becomes non-zero (a running counter value).
    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    // Launch-configuration errors are only visible through cudaGetLastError().
    cuerr = ::cudaGetLastError();
    if (cuerr != cudaSuccess) return cuerr;
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -G -arch=sm_52 -o t271
$ cuda-memcheck --tool racecheck ./t271 1024
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$ nvcc -lineinfo -arch=sm_52 -o t271
$ cuda-memcheck --tool racecheck ./t271 1024
count = 3145571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [236072 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [1992 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [2369 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [232728 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [913 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [233479 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [1841 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [239007 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [1833 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [228636 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [1689 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [225456 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [2177 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [151696 hazards]
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [1009 hazards]
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/, int, int*) [8064 hazards]
========= RACECHECK SUMMARY: 16 hazards displayed (16 errors, 0 warnings)


  1. cuda-memcheck中的错误
  2. 使用-G生成代码的某种特性,它以某种方式对工具完全掩盖了竞争条件。我确实不确定是哪一种。

答案 1 :(得分:0)


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

// Zeroes the accumulator array: thread t clears totalcount[t].
// Only threadIdx.x is used for indexing, so launch a single block whose size
// equals the number of slots to clear.
__global__ void init_kernel(unsigned* totalcount)
{
        totalcount [threadIdx.x] = 0;
}

#define TRACE 0

// For every column index s handled by this thread, counts how many of the M
// entries data[s + N * j] (j = 0 .. M-1) are non-zero, accumulates the
// per-thread counts of the block so that slot t holds the sum of slots 0 .. t,
// and finally has thread 0 atomically add the block total
// (s_offset[blockDim.x - 1]) to *totalcount.
//
// Launch layout : 1-D grid of 1-D blocks, blockDim.x <= 1024 (size of s_offset).
// Shared memory : 4096 bytes static (1024 unsigned).
// Precondition  : N must be a multiple of blockDim.x so that all threads of a
//                 block execute the same number of outer iterations and reach
//                 every __syncthreads().
// Set TRACE to 1 to print block 0's per-thread values before and after the scan.
__global__ void count_zeros_shared(int N, int M, int* data, unsigned* totalcount)
{
        __shared__ unsigned s_offset[1024];

        for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
        {
                // count non-zero values
                unsigned count = 0;
                for (int j = 0; j < M; ++j)
                {
                        if (data[s + N * j] != 0)
                                ++count;
                }
                s_offset[threadIdx.x] = count;

                #if TRACE
                if ((s - threadIdx.x) == 0)
                        printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
                #endif

                // reduce offset
                for (int shift = 1; shift < blockDim.x; shift += shift)
                {
                        __syncthreads();

                        #if 0 // race condition version

                        if (threadIdx.x >= shift)
                        {
                                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
                        }

                        #else // read phase and write phase separated by a barrier

                        unsigned val = s_offset[threadIdx.x];
                        if (threadIdx.x >= shift)
                        {
                                val += s_offset[threadIdx.x - shift] ;
                        }
                        // Every thread has finished reading its neighbour's
                        // slot before any slot is overwritten. The barrier is
                        // outside the conditional so all threads reach it.
                        __syncthreads();
                        if (threadIdx.x >= shift)
                        {
                                s_offset[threadIdx.x] = val ;
                        }

                        #endif
                }
                __syncthreads();

                #if TRACE
                if ((s - threadIdx.x) == 0)
                        printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
                #endif

                if (threadIdx.x == 0)
                        atomicAdd(totalcount, s_offset[blockDim.x - 1]);

                // Keep the last thread from overwriting its slot in the next
                // outer iteration before thread 0 has read the block total.
                __syncthreads();
        }
}



// Prints the accumulated total for each block size 32, 64, ..., 1024; the
// value for block size NTH is read from totalcount[(NTH / 32) - 1].
// Every launched thread prints all lines, so launch with a single thread.
// The parameter N is not used.
__global__ void printsum_kernel(int N, unsigned* totalcount)
{
        for (int NTH = 32 ; NTH <= 1024 ; NTH *= 2)
        {
                printf("GPU TOTAL COUNT [BlockDIM = %d] = %d\n", NTH, totalcount[(NTH / 32) - 1]);
        }
}

#include <cstdlib>

// Fills an N x M managed buffer with pseudo-random zero / non-zero values,
// then runs count_zeros_shared once per block size 32, 64, ..., 1024. Each run
// accumulates into its own slot totalcount[(NTH / 32) - 1]. Finally prints
// the per-block-size GPU totals and the host-side count for comparison.
// Returns the first failing cudaError_t as the exit code, otherwise the
// result of cudaDeviceReset().
int main(int argc, char* argv[])
{
        cudaError_t cuerr;

        int* values;
        unsigned* totalcount;
        int N = 1024 * 48, M = 448;

        cuerr = ::cudaMalloc(&totalcount, (1024/32) * sizeof(unsigned)) ;
        if (cuerr != cudaSuccess) return cuerr;

        cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
        if (cuerr != cudaSuccess) return cuerr;

        int count = 0;

        // Roughly one element in seven becomes non-zero (a running counter value).
        for (int k = 0; k < N*M; ++k)
        {
                if ((rand() % 7) == 0) values[k] = ++count ;
                else values[k] = 0;
        }

        // One thread per accumulator slot.
        init_kernel <<< 1, 1024/32 >>> (totalcount);
        // Launch-configuration errors are only visible through cudaGetLastError().
        cuerr = ::cudaGetLastError();
        if (cuerr != cudaSuccess) return cuerr;
        cuerr = ::cudaDeviceSynchronize();
        if (cuerr != cudaSuccess) return cuerr;

        for (int NTH = 32; NTH <= 1024; NTH *= 2)
        {
                printf("RUNNING %d threads per block\n", NTH);

                count_zeros_shared <<< N / NTH, NTH >>> (N, M, values, totalcount + ((NTH / 32) - 1));
                cuerr = ::cudaGetLastError();
                if (cuerr != cudaSuccess) return cuerr;
                cuerr = ::cudaDeviceSynchronize();
                if (cuerr != cudaSuccess) return cuerr;
        }

        printsum_kernel <<< 1, 1 >>> (1024/32, totalcount);
        cuerr = ::cudaGetLastError();
        if (cuerr != cudaSuccess) return cuerr;
        cuerr = ::cudaDeviceSynchronize();
        if (cuerr != cudaSuccess) return cuerr;

        printf("GROUND TRUTH TOTAL COUNT = %d\n", count);

        return ::cudaDeviceReset();
}


$> nvcc -G -arch sm_61 -o a3.out
$>  cuda-memcheck --version
CUDA-MEMCHECK version 9.0.176 ID:(44)
$> cuda-memcheck --tool racecheck ./a3.out
RUNNING 32 threads per block
RUNNING 64 threads per block
RUNNING 128 threads per block
RUNNING 256 threads per block
RUNNING 512 threads per block
RUNNING 1024 threads per block
GPU TOTAL COUNT [BlockDIM = 32] = 3145571
GPU TOTAL COUNT [BlockDIM = 64] = 3145571
GPU TOTAL COUNT [BlockDIM = 128] = 3145571
GPU TOTAL COUNT [BlockDIM = 256] = 3145571
GPU TOTAL COUNT [BlockDIM = 512] = 3145571
GPU TOTAL COUNT [BlockDIM = 1024] = 3145571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$> ./a3.out
RUNNING 32 threads per block
RUNNING 64 threads per block
RUNNING 128 threads per block
RUNNING 256 threads per block
RUNNING 512 threads per block
RUNNING 1024 threads per block
GPU TOTAL COUNT [BlockDIM = 32] = 3145571
GPU TOTAL COUNT [BlockDIM = 64] = 3145571
GPU TOTAL COUNT [BlockDIM = 128] = 3161857
GPU TOTAL COUNT [BlockDIM = 256] = 3200816
GPU TOTAL COUNT [BlockDIM = 512] = 3231303
GPU TOTAL COUNT [BlockDIM = 1024] = 3925122

$> nvidia-smi
Fri Jan  5 18:29:33 2018
| NVIDIA-SMI 384.98                 Driver Version: 384.98                    |
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1080    Off  | 00000000:02:00.0 Off |                  N/A |
| 29%   44C    P0    39W / 180W |      0MiB /  8112MiB |      0%      Default |

