I am working on a stochastic process, and I would like the random number generation in my CUDA kernel to produce a different sequence every time the program is run. This is similar to what we do in C++ by declaring seed = time(NULL), followed by srand(seed) and rand().
I can pass a seed from the host to the device through the kernel, but the problem with doing that is that I would have to pass an entire array of seeds into the kernel so that each thread has a different random seed each time. Is there a way I can generate a random seed / process id / machine time or something similar outside the kernel and pass it in as the seed?
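For reference, a minimal sketch of the host-side C++ pattern described above (reseeding from the wall clock so each run produces a different sequence):

#include <cstdio>
#include <cstdlib>
#include <ctime>

int main() {
    srand((unsigned int)time(NULL));   // seed changes on every run
    printf("%d\n", rand());            // subsequent rand() calls draw from a run-specific sequence
    return 0;
}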
Answer 0 (score: 5)
You do not need to pass an array of random seeds. When you use the cuRAND library, you just need to set the sequence-number parameter of curand_init properly. For example [disclaimer: the function is not tested]:
__global__ void generate_random_numbers(float* numbers, unsigned long seed, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        curand_init(seed, i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}
If you change the curand_init call to

curand_init(clock64(), i, 0, &state);

the seed differs on every run, so each run of the program produces a different random sequence without passing any seed from the host.
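As a minimal sketch under the same assumptions as the kernel above, the seed parameter can then be dropped entirely:

__global__ void generate_random_numbers(float* numbers, int Np) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < Np) {
        curandState state;
        // clock64() differs from run to run, so no host-side seed is needed
        curand_init(clock64(), i, 0, &state);
        numbers[i] = curand_uniform(&state);
    }
}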
EDIT
Following Roger Dahl's comment, I compared four different possibilities for generating an array of 131072 elements (Kepler K20c): initialization and generation in separate kernels, both in a single kernel, and the same two variants with multiple random-number generations per initialization. The code is below. The resulting times were 861 ms, 852 ms, 866 ms, and 2556 ms, respectively. I hope I have correctly understood the performance issue raised by Roger Dahl.
#include <stdio.h>
#include <curand.h>
#include <curand_kernel.h>

#define DSIZE 8192*16
#define nTPB 256

/***********************/
/* CUDA ERROR CHECKING */
/***********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

/*************************/
/* CURAND INITIALIZATION */
/*************************/
__global__ void initCurand(curandState *state, unsigned long seed){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curand_init(seed, idx, 0, &state[idx]);
}

__global__ void testrand1(curandState *state, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    a[idx] = curand_uniform(&state[idx]);
}

__global__ void testrand2(unsigned long seed, float *a){
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    a[idx] = curand_uniform(&state);
}

/********/
/* MAIN */
/********/
int main() {
    int n_iter = 20;

    curandState *devState; gpuErrchk(cudaMalloc((void**)&devState, DSIZE*sizeof(curandState)));
    float *d_a;            gpuErrchk(cudaMalloc((void**)&d_a, DSIZE*sizeof(float)));

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Case 1: initialization and generation in separate kernels
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels: %3.1f ms \n", time);

    // Case 2: initialization and generation in a single kernel
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels: %3.1f ms \n", time);

    // Case 3: separate kernels, one initialization reused for three generations
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        initCurand<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, 1);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand1<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(devState, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for separate kernels with multiple random number generation: %3.1f ms \n", time);

    // Case 4: single kernel, re-initializing the state on every generation
    cudaEventRecord(start, 0);
    for (int i=0; i<n_iter; i++) {
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
        testrand2<<<(DSIZE+nTPB-1)/nTPB,nTPB>>>(1, d_a);
        gpuErrchk(cudaPeekAtLastError());
        gpuErrchk(cudaDeviceSynchronize());
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("Elapsed time for single kernels for multiple random number generation: %3.1f ms \n", time);

    getchar();
}
Output on a GTX660:
Elapsed time for separate kernels: 1960.3 ms
Elapsed time for single kernels: 1536.9 ms
Elapsed time for separate kernels with multiple random number generation: 1576.0 ms
Elapsed time for single kernels for multiple random number generation: 4612.2 ms
Output on a GTX570:
Elapsed time for separate kernels: 957.2 ms
Elapsed time for single kernels: 947.7 ms
Elapsed time for separate kernels with multiple random number generation: 964.6 ms
Elapsed time for single kernels for multiple random number generation: 2839.0 ms
Roughly the same performance as the K20c.
Answer 1 (score: 1)
Using a different seed on each run should be straightforward. The exact method depends on which generator you are using, but if you use one of the cuRAND generators you can cast the time_t returned by time(NULL) to a 64-bit integer and pass it to the seed function.
If you are calling the generator from a kernel, you will need to pass this seed in as a kernel argument or via a __device__ variable. You can then use an offset to curand_init(), or use skip_ahead(), to get distinct sub-sequences.
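As a minimal sketch of that approach (the kernel name and launch sizes here are hypothetical, not from the original answer), the host casts time(NULL) to a 64-bit integer and passes it as a kernel argument; each thread uses its index as the curand_init() sub-sequence so that threads draw from distinct streams:

#include <time.h>
#include <curand_kernel.h>

__global__ void fill_uniform(float *out, unsigned long long seed, int n) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < n) {
        curandState state;
        // same seed for all threads; the thread index selects the sub-sequence
        curand_init(seed, idx, 0, &state);
        out[idx] = curand_uniform(&state);
    }
}

// Host side (inside main), assuming d_out is a device buffer of n floats:
//   unsigned long long seed = (unsigned long long)time(NULL); // differs on every run
//   fill_uniform<<<(n + 255) / 256, 256>>>(d_out, seed, n);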
If you have a particular generator for which this does not work, please post more information.
Answer 2 (score: 0)
You can create multiple global functions for random number initialization and generation, or create a loop over the global function. Example:

for (int rns = 0; rns < 5; rns++) {   // to seed 'loop' times
    init<<<N, 10>>>(devState, time(0));
    gpuErrchk(cudaMalloc((void**)&gpu_no, N * sizeof(double)));   // allocate memory for random numbers on device/GPU
    //rndn<<<N, 10>>>(devState, gpu_no);   // invoke kernel to launch the random numbers
    gpuErrchk(cudaMemcpy(cpu_no, gpu_no, N * sizeof(double), cudaMemcpyDeviceToHost));
}
cout << "the transition matrix " << ++generate << " seed generation is: " << init << endl;
This has no noticeable effect on the generated random numbers. However, over long runs there are concerns about correlation and lack of convergence, which is why you would want to seed more than once across iterations.
You can use the library functions to generate different kinds of random number distributions, such as curand_uniform, curand_normal, curand_poisson, etc.
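As a brief illustration of those device-API calls (the output buffers and the lambda value are assumptions, not from the answer):

__global__ void sample_distributions(float *u, float *g, unsigned int *p, unsigned long long seed) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    curandState state;
    curand_init(seed, idx, 0, &state);
    u[idx] = curand_uniform(&state);        // uniform in (0, 1]
    g[idx] = curand_normal(&state);         // standard normal
    p[idx] = curand_poisson(&state, 4.0);   // Poisson with lambda = 4.0
}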
I don't know whether this answers your question.