在GPU上使用CUDA生成白噪声

时间:2014-03-31 18:02:30

标签: cuda

我想使用CUDA生成白噪声(正态分布)。以下是我的尝试。

代码如下:

// Factor applied to every curand_normal() sample in generate_normal_kernel.
#define SCALE 1.0
// Offset added to every scaled sample in generate_normal_kernel.
#define SHIFT 0.0
// Number of thread blocks used for both kernel launches.
#define BLOCKS 64
// Number of threads per block used for both kernel launches.
#define THREADS 64

// Initializes one cuRAND generator state per thread.
// state: array of generator states, indexed by the flat global thread index.
__global__ void setup_kernel(curandState *state)
{
// Flat global thread index for a 1D launch.
int id = threadIdx.x + blockIdx.x * blockDim.x;
// curand_init takes (seed, sequence, offset, state); here every thread gets
// a different seed (7+id) as well as a different sequence number (id).
curand_init(7+id, id, 0, &state[id]);
}

// Draws scaled/shifted curand_normal() samples per thread and stores the last
// one, converted to int, in result.
// state:  per-thread generator states, indexed by the flat global thread index.
// result: output array, one int per thread.
__global__ void generate_normal_kernel(curandState *state, int *result)
{
// Flat global thread index for a 1D launch.
int id = threadIdx.x + blockIdx.x * blockDim.x;
float x;
// Work on a local copy of this thread's generator state.
curandState localState = state[id];
// Every iteration overwrites x, so only the final sample survives the loop.
for(int n = 0; n < 100000; n++) {
x = (curand_normal(&localState) * SCALE)+SHIFT;
}
// Store the advanced generator state back into the array.
state[id] = localState;
// The cast to int discards the fractional part of the float sample.
result[id] = (int) x;
}


// Host driver: allocates host/device buffers, launches the setup and
// generation kernels, copies the results back and frees the buffers.
// NOTE(review): CUDA_CALL is not defined anywhere in the visible code, which
// matches the "identifier CUDA_CALL is undefined" error reported below.
int main(int argc, char *argv[])
{
int i;
// Declared but never used (see the compiler warning reported below).
unsigned int total;
curandState *devStates;
int *devResults, *hostResults;
int device;
struct cudaDeviceProp properties;

// Query the current device and its properties.
CUDA_CALL(cudaGetDevice(&device));
CUDA_CALL(cudaGetDeviceProperties(&properties,device));

// Zero-initialized host buffer, one int per launched thread.
hostResults = (int *)calloc(THREADS * BLOCKS, sizeof(int));

// Device result buffer, one int per launched thread.
CUDA_CALL(cudaMalloc((void **)&devResults, BLOCKS * THREADS *
sizeof(int)));

// Clear the device result buffer.
CUDA_CALL(cudaMemset(devResults, 0, THREADS * BLOCKS *
sizeof(int)));

// Device buffer holding one generator state per launched thread.
CUDA_CALL(cudaMalloc((void **)&devStates, THREADS * BLOCKS *
sizeof(curandState)));

// Initialize the generator states.
setup_kernel<<<BLOCKS, THREADS>>>(devStates);

// Generate the samples into devResults.
generate_normal_kernel<<<BLOCKS, THREADS>>>(devStates, devResults);

// Copy the results from the device back to the host buffer.
CUDA_CALL(cudaMemcpy(hostResults, devResults, BLOCKS * THREADS *
sizeof(int), cudaMemcpyDeviceToHost));


// NOTE(review): I_TCS and ITCSAmp are not declared in the visible code, and
// hostResults is a pointer (int *), so this multiplication has a pointer operand.
I_TCS = ITCSAmp*hostResults;


/* Cleanup */
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaFree(devResults));
free(hostResults);
return EXIT_SUCCESS;
}

=============================================== ================================

但我收到了以下错误,

错误:标识符“CUDA_CALL”未定义

错误:表达式必须具有算术或枚举类型

错误:表达式必须具有算术或枚举类型

错误:表达式必须具有算术或枚举类型

警告:变量“total”已声明但从未引用

错误:标识符“devStates”未定义

错误:标识符“CUDA_CALL”未定义

错误:标识符“devResults”未定义

错误:标识符“hostResults”未定义

我认为我已经定义了这些标识符,但显然没有起作用。如果您有任何建议,或者知道应该如何修改代码,我将非常感谢您的帮助!

1 个答案:

答案 0 :(得分:1)

下面是一段可以编译并运行的代码,用于在CUDA中生成服从正态分布的随机数。它是在您上面发布的代码基础上修改而来的;部分被修改的语句以注释的形式保留了其旧版本。

我已根据“What is the canonical way to check for errors using the CUDA runtime API?”一文,将CUDA_CALL更改为gpuErrchk。

我认为您误解了curand_init的参数用法,因此对其做了修正。此外,setup_kernel内核缺少seed参数,所以我添加了它。

我简化了您的generate_normal_kernel内核:我认为用for循环反复重新计算x没有意义,因为每次迭代都会覆盖x,最终只有最后一次的值被保留。

curand_normal返回的是float而非int;实际上,整数上的正态分布并没有定义。因此我相应地更改了相关变量的类型。

#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>
#include <time.h>

// Factor applied to every curand_normal() sample in generate_normal_kernel.
#define SCALE 1.0f
// Offset added to every scaled sample in generate_normal_kernel.
#define SHIFT 0.0f
// Number of thread blocks used for both kernel launches.
#define BLOCKS 64
// Number of threads per block used for both kernel launches.
#define THREADS 64

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
// gpuErrchk(ans): passes the cudaError_t produced by `ans` to gpuAssert
// together with the source file and line of the call site.
// Wrapped in do { ... } while (0) so the macro expands to exactly one
// statement and stays safe inside unbraced if/else bodies.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call on stderr and optionally terminates.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site (__FILE__ is a string literal,
//           hence const char *)
//   line  - source line of the call site
//   abort - when true (default), exit the process with `code` as exit status
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

/*************************/
/* CURAND INITIALIZATION */
/*************************/
// Initializes one cuRAND generator state per thread.
//   seed  - seed shared by all threads
//   state - array of generator states, indexed by the flat global thread index
// Expects a 1D launch whose total thread count does not exceed the number of
// elements in `state` (there is no bounds check).
__global__ void setup_kernel(unsigned long seed, curandState *state)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    curandState *slot = state + tid;

    // Same seed for every thread, with the thread index as sequence number.
    // (The question's version used a per-thread seed instead: 7+id.)
    curand_init(seed, tid, 0, slot);
}

/*****************************************/
/* RANDOM DISTRIBUTION GENERATION KERNEL */
/*****************************************/
// Writes one scaled and shifted curand_normal() sample per thread.
//   state  - per-thread generator states (advanced by this kernel)
//   result - output array, one float per thread
// Expects a 1D launch whose total thread count does not exceed the number of
// elements in `state` and `result` (there is no bounds check).
__global__ void generate_normal_kernel(curandState *state, float *result)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Draw from a local copy of the generator state, then store the
    // advanced state back so a later launch continues the sequence.
    curandState local = state[tid];
    const float sample = curand_normal(&local);
    state[tid] = local;

    result[tid] = (sample * SCALE) + SHIFT;
}

/********/
/* MAIN */
/********/
// Program entry point: generates BLOCKS * THREADS normally distributed
// floats on the device, copies them to the host and prints them.
// Returns 0 on success and 1 if the host buffer cannot be allocated;
// CUDA failures terminate the process inside gpuErrchk.
int main()
{
    // One sample (and one generator state) per launched thread.
    const int count = BLOCKS * THREADS;

    float *hostResults = (float *)calloc(count, sizeof(float));
    if (hostResults == NULL)
    {
        fprintf(stderr, "Host allocation of %d floats failed\n", count);
        return 1;
    }

    float *devResults;
    gpuErrchk(cudaMalloc((void **)&devResults, count * sizeof(float)));
    gpuErrchk(cudaMemset(devResults, 0, count * sizeof(float)));

    curandState *devStates;
    gpuErrchk(cudaMalloc((void **)&devStates, count * sizeof(curandState)));

    // Seed from the wall clock, so each run produces different numbers.
    setup_kernel<<<BLOCKS, THREADS>>>((unsigned long)time(NULL), devStates);
    gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

    generate_normal_kernel<<<BLOCKS, THREADS>>>(devStates, devResults);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(hostResults, devResults, count * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < count; i++) printf("rand[%i] = %f\n", i, hostResults[i]);

    /* Cleanup */
    gpuErrchk(cudaFree(devStates));
    gpuErrchk(cudaFree(devResults));

    free(hostResults);

    // Wait for a character on stdin before exiting (presumably to keep a
    // console window open -- kept from the original).
    getchar();

    return 0;
}