How can we generate random numbers in CUDA C with a different seed on each run?

Date: 2014-03-15 14:51:07

Tags: cuda nvidia gpu-programming

I am working on a stochastic process, and I would like the random number generation in my CUDA kernel to produce a different sequence each time the program is run. This is similar to what we do in C++ by writing seed = time(NULL), followed by srand(seed) and rand().
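For reference, the standard host-side C idiom being alluded to looks like this (a minimal, self-contained example, not code from the question):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    int main(void) {
        unsigned int seed = (unsigned int)time(NULL); // different on every run
        srand(seed);                                  // seed the host RNG once
        printf("%d\n", rand());                       // draw a pseudo-random number
        return 0;
    }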

I can pass seeds from the host to the device as kernel arguments, but the problem with doing so is that I would have to pass an entire array of seeds into the kernel, a different random seed for each thread on each run. Is there a way to generate a random seed outside the kernel, from the process id, machine time, or something similar, and pass it in as the seed?
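One common way to build such a single seed on the host from the machine time and process id is sketched below; the exact mixing scheme is an illustrative assumption, not something prescribed by CUDA:

    #include <time.h>
    #include <unistd.h>  // getpid() (POSIX)

    unsigned long long make_seed(void) {
        // Mix the machine time with the process id so that two runs started
        // within the same second still receive different seeds.
        return ((unsigned long long)time(NULL) << 20)
             ^ (unsigned long long)getpid();
    }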

3 Answers:

Answer 0 (score: 5):

You do not need to pass an array of random seeds. When you use the cuRAND library, you can instead set the sequence-number parameter of curand_init appropriately. For example [disclaimer: untested code]:

__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // Same seed for all threads, but a distinct subsequence (i) per
        // thread, so each thread draws from an independent stream.
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
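A host-side driver for this kernel might look like the following untested sketch; the array size and launch configuration are illustrative, and it assumes the kernel above is compiled in the same .cu file together with #include <curand_kernel.h>:

    #include <ctime>

    int main() {
        const int Np = 131072;
        float *d_numbers;
        cudaMalloc((void**)&d_numbers, Np * sizeof(float));

        // A fresh seed on every run, as the question asks for.
        unsigned long seed = (unsigned long)time(NULL);

        const int threads = 256;
        const int blocks  = (Np + threads - 1) / threads;
        generate_random_numbers<<<blocks, threads>>>(d_numbers, seed, Np);
        cudaDeviceSynchronize();

        cudaFree(d_numbers);
        return 0;
    }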

You can also avoid passing the seed in from outside if you change the curand_init instruction to

curand_init(clock64(), i, 0, &state);

EDIT

Following Roger Dahl's comment, I compared four different ways of generating an array of 131072 elements (on a Kepler K20c):

  1. single random number generation: separate kernels for initialization and random number generation;
  2. single random number generation: a single kernel for both initialization and random number generation;
  3. multiple random number generation: separate kernels for initialization and random number generation;
  4. multiple random number generation: a single kernel for both initialization and random number generation.

The code is below. The resulting timings were:

    1. 861ms;
    2. 852ms;
    3. 866ms;
    4. 2556ms;

I hope I have correctly understood the performance issue raised by Roger Dahl.

      #include <stdio.h>
      #include <curand.h>
      #include <curand_kernel.h>
      
      #define DSIZE 8192*16
      #define nTPB 256
      
      /***********************/
      /* CUDA ERROR CHECKING */
      /***********************/
      #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
      inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
      {
          if (code != cudaSuccess) 
          {
              fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
              if (abort) exit(code);
          }
      }
      
      /*************************/
      /* CURAND INITIALIZATION */
      /*************************/
      __global__ void initCurand(curandState *state, unsigned long seed){
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          curand_init(seed, idx, 0, &state[idx]);
      }
      
      __global__ void testrand1(curandState *state, float *a){
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          a[idx] = curand_uniform(&state[idx]);
      }
      
      __global__ void testrand2(unsigned long seed, float *a){
          int idx = threadIdx.x + blockIdx.x * blockDim.x;
          curandState state;
          curand_init(seed, idx, 0, &state);
          a[idx] = curand_uniform(&state);
      }
      
      /********/
      /* MAIN */
      /********/
      int main() {
      
          int n_iter = 20;
      
          curandState *devState;  gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
      
          float *d_a;             gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));
      
          float time;
          cudaEvent_t start, stop;
      
          cudaEventCreate(&start);
          cudaEventCreate(&stop);
          cudaEventRecord(start, 0);
      
          for (int i=0; i<n_iter; i++) {
      
              initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
              testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
          }
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&time, start, stop);
          printf("Elapsed time for separate kernels:  %3.1f ms \n", time);
      
          cudaEventRecord(start, 0);
      
          for (int i=0; i<n_iter; i++) {
      
              testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
          }
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&time, start, stop);
          printf("Elapsed time for single kernels:  %3.1f ms \n", time);
      
          cudaEventRecord(start, 0);
      
          for (int i=0; i<n_iter; i++) {
      
              initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
              testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
              testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
              testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
          }
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&time, start, stop);
          printf("Elapsed time for separate kernels with multiple random number generation:  %3.1f ms \n", time);
      
          cudaEventRecord(start, 0);
      
          for (int i=0; i<n_iter; i++) {
      
              testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
              testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
      
              testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
              gpuErrchk(cudaPeekAtLastError());
              gpuErrchk(cudaDeviceSynchronize());
          }
      
          cudaEventRecord(stop, 0);
          cudaEventSynchronize(stop);
          cudaEventElapsedTime(&time, start, stop);
          printf("Elapsed time for single kernels for multiple random number generation:  %3.1f ms \n", time);
      
          getchar();
      }
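
For reference, this benchmark should build with a plain nvcc invocation along the following lines (the file name and architecture flag are assumptions; pick -arch to match your GPU):

    nvcc -O3 -arch=sm_35 curand_benchmark.cu -o curand_benchmark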
      

      Output on a GTX660:

      Elapsed time for separate kernels:  1960.3 ms
      Elapsed time for single kernels:  1536.9 ms
      Elapsed time for separate kernels with multiple random number generation:  1576.0 ms
      Elapsed time for single kernels for multiple random number generation:  4612.2 ms
      

      Output on a GTX570:

      Elapsed time for separate kernels:  957.2 ms 
      Elapsed time for single kernels:  947.7 ms 
      Elapsed time for separate kernels with multiple random number generation:  964.6 ms 
      Elapsed time for single kernels for multiple random number generation:  2839.0 ms 
      

      Roughly the same relative performance as on the K20c.

Answer 1 (score: 1):

Using a different seed on each run should be straightforward. The exact method depends on which generator you are using, but if you are using one of the cuRAND generators, you can cast the time_t from time(NULL) to a 64-bit integer and pass it to the seed function.
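With the host (curandGenerator_t) API, that seeding step looks roughly like the sketch below (untested; the generator type and the helper function are illustrative assumptions):

    #include <time.h>
    #include <curand.h>

    // Fill a device buffer d_out with n uniform floats, reseeding from the
    // clock so that each program run yields a different sequence.
    void fill_uniform(float *d_out, size_t n) {
        curandGenerator_t gen;
        curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen, (unsigned long long)time(NULL));
        curandGenerateUniform(gen, d_out, n);
        curandDestroyGenerator(gen);
    }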

If you are calling the generator from a kernel, you will need to pass this seed in as a kernel parameter or via a __device__ variable. You can then use an offset to curand_init(), or use skipahead(), to get different subsequences.
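A device-API sketch of the skip-ahead mechanism (untested; the kernel name and the per-thread chunk size are assumptions):

    #include <curand_kernel.h>

    // Every thread shares one stream (subsequence 0) and uses skipahead()
    // to jump to a disjoint block of draws, as an alternative to giving
    // each thread its own subsequence in curand_init.
    __global__ void draw_with_skipahead(unsigned long long seed, float *out, int n) {
        const unsigned long long chunk = 1024;  // draws reserved per thread
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        if (i < n) {
            curandState s;
            curand_init(seed, 0, 0, &s);                  // one common stream
            skipahead((unsigned long long)i * chunk, &s); // disjoint offsets
            out[i] = curand_uniform(&s);
        }
    }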

If you have a particular generator for which this does not work, please post more information.

Answer 2 (score: 0):

You can create multiple __global__ functions for the random number initialization and generation, or create a loop that runs over the global function. For example:

    for (int rns = 0; rns < 5; rns++) { // re-seed 'loop' times
        init<<<N, 10>>>(devState, time(0));
        gpuErrchk(cudaMalloc((void**)&gpu_no, N * sizeof(double))); // allocate memory for random numbers on device/GPU (better hoisted out of the loop)
        //rndn<<<N, 10>>>(devState, gpu_no); // invoke kernel to generate the random numbers
        gpuErrchk(cudaMemcpy(cpu_no, gpu_no, N * sizeof(double), cudaMemcpyDeviceToHost));
    }
    cout << "the transition matrix " << ++generate << " seed generation is: " << init << endl;

This does not have any noticeable effect on the generated random numbers. However, over many runs there are concerns about correlation and lack of convergence, which is why you would not want to seed more than once within an iteration. You can use the library functions to generate different types of random number distributions, such as curand_uniform, curand_normal, curand_poisson, and so on.
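A sketch of those distribution calls in the device API (untested; the kernel name and the Poisson rate lambda = 4.0 are arbitrary assumptions):

    #include <curand_kernel.h>

    __global__ void sample_distributions(unsigned long long seed, float *u,
                                         float *g, unsigned int *p, int n) {
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        if (i < n) {
            curandState state;
            curand_init(seed, i, 0, &state);
            u[i] = curand_uniform(&state);       // uniform in (0, 1]
            g[i] = curand_normal(&state);        // standard normal
            p[i] = curand_poisson(&state, 4.0);  // Poisson with lambda = 4.0
        }
    }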

I do not know whether this answers your question.