我想使用CUDA生成服从正态分布的白噪声。以下是我尝试编写的代码:
// Multiplier applied to each sample in generate_normal_kernel.
// NOTE(review): 1.0 without an 'f' suffix is a double literal, so the
// expression it is used in is evaluated in double precision.
#define SCALE 1.0
// Constant added to each scaled sample (also a double literal).
#define SHIFT 0.0
// Number of thread blocks used for both kernel launches in main.
#define BLOCKS 64
// Number of threads per block used for both kernel launches in main.
#define THREADS 64
// Initializes one cuRAND generator state per thread.
// Each thread computes its flat global index `id` and calls curand_init with
// 7 + id as the first argument, id as the second, and 0 as the third,
// storing the result in state[id].
// NOTE(review): state[id] is written without a bounds check, so `state` must
// hold at least gridDim.x * blockDim.x elements.
__global__ void setup_kernel(curandState *state)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(7+id, id, 0, &state[id]);
}
// Draws samples with curand_normal and stores one integer per thread in result.
// The thread's generator state is copied into a local variable, advanced
// 100000 times, and then written back to state[id].
// NOTE(review): no bounds check; `state` and `result` must both hold at least
// gridDim.x * blockDim.x elements.
__global__ void generate_normal_kernel(curandState *state, int *result)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
float x;
// Work on a local copy of the generator state; it is stored back after the loop.
curandState localState = state[id];
// NOTE(review): every iteration overwrites x, so only the last of the
// 100000 draws is ever stored.
for(int n = 0; n < 100000; n++) {
x = (curand_normal(&localState) * SCALE)+SHIFT;
}
state[id] = localState;
// NOTE(review): the cast truncates the floating-point sample toward zero,
// discarding its fractional part.
result[id] = (int) x;
}
// Host driver: allocates host and device buffers for THREADS * BLOCKS results,
// initializes the generator states, runs the generation kernel, copies the
// results back to the host and releases all buffers.
// NOTE(review): CUDA_CALL is used throughout but is not defined anywhere in
// this snippet, and no #include lines are shown.
int main(int argc, char *argv[])
{
// NOTE(review): `i` and `total` are declared but never used below.
int i;
unsigned int total;
curandState *devStates;
int *devResults, *hostResults;
int device;
struct cudaDeviceProp properties;
// Query the current device and its properties; `properties` is not read afterwards.
CUDA_CALL(cudaGetDevice(&device));
CUDA_CALL(cudaGetDeviceProperties(&properties,device));
// Zero-initialized host buffer, one int per thread.
hostResults = (int *)calloc(THREADS * BLOCKS, sizeof(int));
// Device result buffer of the same size, cleared to zero.
CUDA_CALL(cudaMalloc((void **)&devResults, BLOCKS * THREADS *
sizeof(int)));
CUDA_CALL(cudaMemset(devResults, 0, THREADS * BLOCKS *
sizeof(int)));
// One generator state per thread.
CUDA_CALL(cudaMalloc((void **)&devStates, THREADS * BLOCKS *
sizeof(curandState)));
// NOTE(review): neither kernel launch is followed by an error check.
setup_kernel<<<BLOCKS, THREADS>>>(devStates);
generate_normal_kernel<<<BLOCKS, THREADS>>>(devStates, devResults);
CUDA_CALL(cudaMemcpy(hostResults, devResults, BLOCKS * THREADS *
sizeof(int), cudaMemcpyDeviceToHost));
// NOTE(review): I_TCS and ITCSAmp are not declared in this snippet, and
// hostResults is a pointer, so it cannot be an operand of `*` here.
I_TCS = ITCSAmp*hostResults;
/* Cleanup */
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaFree(devResults));
free(hostResults);
return EXIT_SUCCESS;
}
=============================================== ================================
但我收到了以下错误,
错误:标识符“CUDA_CALL”未定义
错误:表达式必须具有算术或枚举类型
错误:表达式必须具有算术或枚举类型
错误:表达式必须具有算术或枚举类型
警告:变量“total”已声明但从未引用
错误:标识符“devStates”未定义
错误:标识符“CUDA_CALL”未定义
错误:标识符“devResults”未定义
错误:标识符“hostResults”未定义
我认为我已经定义了它们,但显然没有起作用。如果您有任何建议,或者知道应该如何修改代码,我将非常感谢您的帮助!
答案 0 :(得分:1)
下面是一份可以编译并运行的代码,用于在CUDA中生成服从正态分布的随机数。它是在您上面发布的代码基础上修改而来的。部分被修改的语句以注释的形式保留了其旧版本。
我已根据“What is the canonical way to check for errors using the CUDA runtime API?”这一问答,将CUDA_CALL更改为gpuErrchk。
我认为您误解了curand_init的语法,因此对其做了修正。此外,setup_kernel内核缺少seed参数,因此我添加了它。
我简化了您的generate_normal_kernel内核:我认为用for循环重复计算x是没有必要的。
curand_normal返回的是float,而不是int;实际上,在整数上并没有定义正态分布。我相应地更改了相关变量的类型。
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <time.h>
// Multiplier applied to each sample in generate_normal_kernel (float literal).
#define SCALE 1.0f
// Constant added to each scaled sample (float literal).
#define SHIFT 0.0f
// Number of thread blocks used for both kernel launches in main.
#define BLOCKS 64
// Number of threads per block used for both kernel launches in main.
#define THREADS 64
/***********************/
/* CUDA ERROR CHECKING */
/***********************/
/*
 * gpuErrchk(call): evaluates a CUDA runtime call and hands its status to
 * gpuAssert together with the file and line of the call site.
 * Wrapped in do { } while (0) so that `gpuErrchk(x);` behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

/*
 * Reports a failed CUDA runtime call on stderr and optionally terminates.
 *
 *   code  - status returned by the CUDA runtime call
 *   file  - source file of the call site (normally __FILE__)
 *   line  - source line of the call site (normally __LINE__)
 *   abort - when true (the default) the process exits with `code` as its
 *           exit status; when false the error is only logged
 *
 * `file` is const because __FILE__ expands to a string literal, which must
 * not be bound to a non-const char pointer in C++.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}
/*************************/
/* CURAND INITIALIZATION */
/*************************/
// Initializes one cuRAND generator state per thread.
//   seed  - seed shared by all threads
//   state - array with at least gridDim.x * blockDim.x elements; there is no
//           bounds check, so the launch must not exceed its size
// Every thread passes its flat global index as the second curand_init
// argument and 0 as the third.
__global__ void setup_kernel(unsigned long seed, curandState *state)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState *slot = state + tid;
    curand_init(seed, tid, 0, slot);
    // Version from the question, kept for comparison:
    // curand_init(7+id, id, 0, &state[id]);
}
/*****************************************/
/* RANDOM DISTRIBUTION GENERATION KERNEL */
/*****************************************/
// Draws one sample per thread with curand_normal, applies SCALE and SHIFT,
// and stores it in result at the thread's flat global index.
//   state  - generator states, one per thread; each is advanced by this call
//   result - output array, one float per thread
// Both arrays must hold at least gridDim.x * blockDim.x elements; there is
// no bounds check.
__global__ void generate_normal_kernel(curandState *state, float *result)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Advance a local copy of the generator state, then store it back so the
    // next launch continues the same sequence.
    curandState rng = state[tid];
    const float sample = curand_normal(&rng);
    state[tid] = rng;

    result[tid] = (sample * SCALE) + SHIFT;
}
/********/
/* MAIN */
/********/
/*
 * Generates BLOCKS * THREADS normally distributed floats on the GPU and
 * prints them to stdout, one per line.
 *
 * Returns 0 on success and 1 if the host buffer cannot be allocated. Any
 * CUDA failure is reported by gpuErrchk, which terminates the process.
 */
int main()
{
    const int count = THREADS * BLOCKS;

    float *hostResults = (float *)calloc(count, sizeof(float));
    if (hostResults == NULL)
    {
        fprintf(stderr, "Failed to allocate %d floats on the host\n", count);
        return 1;
    }

    float *devResults = NULL;
    gpuErrchk(cudaMalloc((void **)&devResults, count * sizeof(float)));
    gpuErrchk(cudaMemset(devResults, 0, count * sizeof(float)));

    curandState *devStates = NULL;
    gpuErrchk(cudaMalloc((void **)&devStates, count * sizeof(curandState)));

    // Seed with the wall-clock time so that each run produces different
    // numbers; the cast makes the narrowing from time_t explicit.
    setup_kernel<<<BLOCKS, THREADS>>>((unsigned long)time(NULL), devStates);
    gpuErrchk(cudaGetLastError());       // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize());  // errors raised while the kernel ran

    generate_normal_kernel<<<BLOCKS, THREADS>>>(devStates, devResults);
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(hostResults, devResults, count * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < count; i++) printf("rand[%i] = %f\n", i, hostResults[i]);

    /* Cleanup */
    gpuErrchk(cudaFree(devStates));
    gpuErrchk(cudaFree(devResults));
    free(hostResults);

    // Wait for a key press so that a console window stays open.
    getchar();
    return 0;
}