Question

我在升级到CUDA 8.0的过程中，发现一些代码开始给出不同的结果。我设法写出了一个能大致复现该问题的MCVE，并解决了我自己的问题。

#include <cub/cub.cuh> // Tested with cub 1.5.5

#include <stdio.h>

// Reports a failed CUDA runtime call.
//
// err:  status code returned by the call being checked.
// file: source file of the call site (normally __FILE__).
// line: source line of the call site (normally __LINE__).
//
// Nothing is printed when err is cudaSuccess. Otherwise the same message is
// written once to stderr and once to stdout. The function only reports; it
// does not terminate the program.
static inline void f(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess)
        return;

    const char *description = cudaGetErrorString(err);
    FILE *const sinks[] = { stderr, stdout };
    for (int i = 0; i < 2; ++i)
        fprintf(sinks[i], "ERROR in file %s, line %d: %s (%d)\n", file, line, description, err);
}

// Evaluates a CUDA runtime call and forwards its status, together with the
// call site's file and line, to f(). Wrapped in do/while(0) so the macro
// behaves as a single statement.
#define CHKERR(expr)                       \
    do {                                   \
        f((expr), __FILE__, __LINE__);     \
    } while (0)

// Sums values from s_arr across the block with cub::BlockReduce and has the
// thread whose global index is 0 print the result.
//
// dimSize: used both as the thread-count template argument of
//          cub::BlockReduce and as the second argument of Sum().
// s_arr:   array read at each thread's global index; the caller visible in
//          this file passes a shared array of dimSize doubles.
//
// NOTE(review): cub::BlockReduce's second template argument is presumably
// meant to equal the block's x-dimension; the launch in this file uses twice
// as many threads -- confirm against the CUB documentation.
template<int dimSize>
__device__ __inline__ void UsedToWork(double *s_arr)
{
    // Global thread index; with the launch in this file it ranges over 0..63.
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    typedef cub::BlockReduce<double, dimSize> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // This is the defective line under discussion: s_arr[idx] is evaluated by
    // every thread before Sum() runs, so threads with idx >= dimSize read past
    // the end of the caller's dimSize-element array. Passing dimSize as the
    // second argument to Sum() does not prevent that read.
    double r = BlockReduce(temp_storage).Sum(s_arr[idx], dimSize);
    __syncthreads();
    // Only the thread with global index 0 prints the reduced value.
    if (idx == 0)
        printf("t0 here %f\n\n", r);
}

// Kernel that stages the first `size` elements of `input` in a block-shared
// array and then calls UsedToWork<size> on that array.
//
// size:  compile-time element count of the shared staging array.
// input: device pointer read at indices 0..size-1.
//
// Threads whose global index is >= size skip the copy but still reach the
// barrier and still call UsedToWork.
template<int size>
__global__ void ShouldWork(double *input)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ double s_arr[size];
    // Bounds-checked copy: only threads that map to a valid element write.
    if (idx < size)
        s_arr[idx] = input[idx];
    // Make the staged values visible to every thread before they are reduced.
    __syncthreads();

    // Called by all threads, including those with idx >= size.
    UsedToWork<size>(s_arr);
}

// Host driver for the reproduction: uploads the values 1..32 to the device,
// launches ShouldWork on them, and reports every CUDA error through CHKERR.
// Always returns 0; CHKERR only prints, it does not abort.
int main()
{
    const int arraySize = 32;
    double h[arraySize] = {
         1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
        11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
        21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
        31, 32
    };

    double *d = 0;

    CHKERR(cudaMalloc((void**)&d, arraySize * sizeof(double)));
    CHKERR(cudaMemcpy(d, h, arraySize * sizeof(double), cudaMemcpyHostToDevice));

    // The kernel is instantiated for arraySize elements but launched with
    // 2 * arraySize threads. That over-sized launch is what makes threads
    // 32..63 index past the 32-element array in this reproduction, so it is
    // kept as-is.
    ShouldWork<arraySize><<<1, arraySize * 2>>>(d);

    CHKERR(cudaGetLastError());       // errors from the launch itself
    CHKERR(cudaDeviceSynchronize());  // errors raised while the kernel ran
    CHKERR(cudaFree(d));

    return 0;
}

我把出问题的那一行替换成了：

double r = BlockReduce(temp_storage).Sum((idx < dimSize ? s_arr[idx] : 0.), dimSize);

这样可以确保当idx大于或等于dimSize（数组的大小）时不会去访问数组，于是不再出现an illegal memory access was encountered (77)错误。虽然这显然是一个bug，但为什么CUDA 7.5一开始会允许这次内存访问而不报任何问题？更有意思的是，如果我在内核中把

UsedToWork<size>(s_arr);

这个调用（它无论如何都应该被内联）替换成它的定义

typedef cub::BlockReduce<double, size> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
double r = BlockReduce(temp_storage).Sum(s_arr[idx], size);
__syncthreads();

CUDA 8.0就不会再报an illegal memory access was encountered (77)错误。现在我彻底糊涂了。这种行为难道不应该至少保持一致吗？

在Windows 7上用VS2013编译，在Titan显卡上运行，驱动版本为369.30。

Answer 1

GPU有一个运行时内存检查器，其细节并未公开。这个内存检查器并不十分精确，但如果发生了足够严重的错误（例如，越界访问超出边界足够远），运行时内存检查就会报告错误、终止内核，并将上下文标记为已损坏。

触发这种情况的具体条件并未公开，而且可能随GPU架构、CUDA版本以及其他因素而变化。

正如评论中推测的那样，运行时错误检查的一种可能机制是检测GPU代码是否访问了不属于其上下文的内存。这样一来，某个数组越界索引是否真的超出了上下文的范围，就取决于该数组恰好位于上下文内存映射中的什么位置。

这样的内存映射很可能随CUDA版本和GPU架构而不同，甚至可能取决于具体的编译选项。

为了获得最好（最严格）的内存访问有效性检查，建议使用cuda-memcheck工具。例如，完全有可能出现这样的情况：一段通过了所有CUDA运行时错误检查的代码，在cuda-memcheck下运行时却会失败（而且确实存在真实的编码缺陷）。

没有任何声明保证GPU在正常运行时一定能检测到无效的内存访问。它确实具备一定的检测能力，但并不完美。我相信，对于我所熟悉的操作系统环境中的主机代码，也可以做出类似的说法。

为什么CUDA 8.0（有时）会报非法内存访问，而7.5不会？

1 个答案: