奇怪的是,当我在cuda-memcheck
之前没有添加./main
时,程序运行时没有任何警告或错误消息,但是,当我添加它时,它会出现如下错误消息。< / p>
========= Invalid __global__ write of size 8
========= at 0x00000120 in initCurand(curandStateXORWOW*, unsigned long)
========= by thread (9,0,0) in block (3,0,0)
========= Address 0x5005413b0 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204115]
========= Host Frame:./main [0x18e11]
========= Host Frame:./main [0x369b3]
========= Host Frame:./main [0x3403]
========= Host Frame:./main [0x308c]
========= Host Frame:./main [0x30b7]
========= Host Frame:./main [0x2ebb]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
这是我的函数,对代码的简要介绍,我尝试生成随机数并将它们保存到设备变量weights
,然后使用此向量从离散数字中进行采样。
#include<iostream>
#include<curand.h>
#include<curand_kernel.h>
#include<time.h>
using namespace std;
#define num 100
__device__ float weights[num];
// function to define seed
__global__ void initCurand(curandState *state, unsigned long seed){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, idx, 0, &state[idx]);
}
__device__ void sampling(float *weight, float max_weight, int *index, curandState *state){
int j;
float u;
do{
j = (int)(curand_uniform(state) * (num + 0.999999));
u = curand_uniform(state); //sample from uniform distribution;
}while( u > weight[j]/max_weight);
*index = j;
}
__global__ void test(int *dev_sample, curandState *state){
int idx = threadIdx.x + blockIdx.x * blockDim.x;\
// generate random numbers from uniform distribution and save them to weights
weights[idx] = curand_uniform(&state[idx]);
// run sampling function, in which, weights is an input for the function on each thread
sampling(weights, 1, dev_sample+idx, &state[idx]);
}
int main(){
// define the seed of random generator
curandState *devState;
cudaMalloc((void**)&devState, num*sizeof(curandState));
int *h_sample;
h_sample = (int*) malloc(num*sizeof(int));
int *d_sample;
cudaMalloc((void**)&d_sample, num*sizeof(float));
initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
test<<<(int)num/32 + 1, 32>>>(d_sample, devState);
cudaMemcpy(h_sample, d_sample, num*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < num; ++i)
{
cout << *(h_sample + i) << endl;
}
//free memory
cudaFree(devState);
free(h_sample);
cudaFree(d_sample);
return 0;
}
刚开始学习cuda,如果访问全局内存的方法不正确,请帮助我。感谢
答案 0 :(得分:2)
这是推出&#34;额外&#34;线程:
initCurand<<<(int)num/32 + 1, 32>>>(devState, 1);
num
为100,因此上面的配置将启动4个32个线程的块,即128个线程。但是你只在这里分配100 curandState
的空间:
cudaMalloc((void**)&devState, num*sizeof(curandState));
因此,您的initCurand
内核会有一些线程(idx
= 100-127)尝试初始化您尚未分配的curandState
个。因此,当您运行cuda-memcheck
进行相当严格的越界检查时,会报告错误。
一种可能的解决方案是修改initCurand
内核,如下所示:
__global__ void initCurand(curandState *state, unsigned long seed, int num){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num)
curand_init(seed, idx, 0, &state[idx]);
}
这将防止任何越界线程做任何事情。请注意,您需要修改内核调用以将num
传递给它。此外,在我看来,您的test
内核中存在类似的问题。您可能想要做类似的事情来修复它。这是CUDA内核中的常见构造,我称之为&#34;线程检查&#34;。您可以在SO标签上找到其他问题,讨论同样的概念。