Question

奇怪的是，当我在cuda-memcheck之前没有添加./main时，程序运行时没有任何警告或错误消息，但是，当我添加它时，它会出现如下错误消息。< / p>

========= Invalid __global__ write of size 8
=========     at 0x00000120 in initCurand(curandStateXORWOW*, unsigned long)
=========     by thread (9,0,0) in block (3,0,0)
=========     Address 0x5005413b0 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204115]
=========     Host Frame:./main [0x18e11]
=========     Host Frame:./main [0x369b3]
=========     Host Frame:./main [0x3403]
=========     Host Frame:./main [0x308c]
=========     Host Frame:./main [0x30b7]
=========     Host Frame:./main [0x2ebb]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]

这是我的函数，对代码的简要介绍，我尝试生成随机数并将它们保存到设备变量weights，然后使用此向量从离散数字中进行采样。

#include<iostream>
#include<curand.h>
#include<curand_kernel.h>
#include<time.h>

using namespace std;

#define num 100


__device__ float weights[num];

// function to define seed
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}


__device__ void sampling(float *weight, float max_weight, int *index, curandState *state){
    int j;
    float u;    
    do{
        j = (int)(curand_uniform(state) * (num + 0.999999)); 
        u = curand_uniform(state); //sample from uniform distribution;
    }while( u > weight[j]/max_weight);
    *index  = j;
}

__global__ void test(int *dev_sample, curandState *state){
    int idx     = threadIdx.x + blockIdx.x * blockDim.x;\
    // generate random numbers from uniform distribution and save them to weights
    weights[idx]    = curand_uniform(&state[idx]);
    // run sampling function, in which, weights is an input for the function on each thread
    sampling(weights, 1, dev_sample+idx, &state[idx]);
}


int main(){ 
    // define the seed of random generator
    curandState *devState;  
    cudaMalloc((void**)&devState, num*sizeof(curandState));

    int *h_sample;
    h_sample    = (int*) malloc(num*sizeof(int));

    int *d_sample;
    cudaMalloc((void**)&d_sample, num*sizeof(float));

    initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
    test<<<(int)num/32 + 1, 32>>>(d_sample, devState);

    cudaMemcpy(h_sample, d_sample, num*sizeof(float), cudaMemcpyDeviceToHost);

    for (int i = 0; i < num; ++i)
    {
        cout << *(h_sample + i) << endl;
    }

    //free memory
    cudaFree(devState);
    free(h_sample);
    cudaFree(d_sample);
    return 0;
}

刚开始学习cuda，如果访问全局内存的方法不正确，请帮助我。感谢

Answer 1

这是推出＆＃34;额外＆＃34;线程：

initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);

num为100，因此上面的配置将启动4个32个线程的块，即128个线程。但是你只在这里分配100 curandState的空间：

cudaMalloc((void**)&devState, num*sizeof(curandState));

因此，您的initCurand内核会有一些线程（idx = 100-127）尝试初始化您尚未分配的curandState个。因此，当您运行cuda-memcheck进行相当严格的越界检查时，会报告错误。

一种可能的解决方案是修改initCurand内核，如下所示：

__global__ void initCurand(curandState *state, unsigned long seed, int num){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < num)
        curand_init(seed, idx, 0, &state[idx]);
}

这将防止任何越界线程做任何事情。请注意，您需要修改内核调用以将num传递给它。此外，在我看来，您的test内核中存在类似的问题。您可能想要做类似的事情来修复它。这是CUDA内核中的常见构造，我称之为＆＃34;线程检查＆＃34;。您可以在SO标签上找到其他问题，讨论同样的概念。

cuda 8.0中'cuda-memcheck'出错

1 个答案: