我想使用GPU生成一个整数数组。我在NVIDIA文档中找到了一些解决方案,并在此基础上编写了下面的简单代码。只有当arraySize变量小于或等于291670时,这段代码才能正常运行;对于更大的值,调用cudaDeviceSynchronize()会返回cudaErrorLaunchFailure(错误4),即“未指定的启动失败”(unspecified launch failure)。
在我的实际应用中,我需要更长的数组。
这是数组长度本身存在某种限制,还是我的代码有错误?
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand_kernel.h>
#include <helper_cuda.h>
#include <curand.h>
#include <stdio.h>
#include <algorithm>
#include <ctime>
#include <iostream>
#include <cstdlib>
// Launch limits and problem size, shared by host and device code.
//
// These are plain namespace-scope `const` integers rather than `__device__`
// variables: the host code reads them directly (e.g. as an argument to
// std::min, which binds a reference), and the address of a `__device__`
// variable is only meaningful in device code. Const integral variables with a
// constant initializer may still be used by value inside kernels.

// Not referenced by the code in this file's visible portion; presumably the
// per-block thread limit of the target GPU -- TODO confirm.
const int MAX_THREADS_PER_BLOCK = 1024;

// Upper bound applied to the grid size when launching the init kernel.
const int MAX_BLOCKS = 65535;

// Number of curandState_t elements allocated and initialized by main().
const unsigned int arraySize = 291670;
/**
 * Initializes one cuRAND generator state per array element.
 *
 * Element i in [0, arraySize) is set up with
 * curand_init(seed, i, 0, &states[i]): every state shares the same seed,
 * uses its own index as the sequence number, and starts at offset 0.
 *
 * Only the x dimension of the launch is used. Each thread starts at its
 * global x index and advances by blockDim.x * gridDim.x, so the whole array
 * is covered even when the grid has fewer threads than elements.
 *
 * @param seed       Seed passed unchanged to every curand_init call.
 * @param states     Output array; indexed from 0 to arraySize - 1.
 * @param arraySize  Number of states to initialize.
 */
__global__ void _rndInit_(unsigned int seed, curandState_t* states, unsigned int arraySize) {
    // Index and stride are widened to 64 bits before multiplying: `long` is
    // only 32 bits on LLP64 platforms, and the 32-bit product
    // blockIdx.x * blockDim.x can wrap for very large grids.
    const unsigned long long stride =
        static_cast<unsigned long long>(blockDim.x) * gridDim.x;
    const unsigned long long first =
        static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (unsigned long long tid = first; tid < arraySize; tid += stride) {
        // NOTE(review): curand_init is a comparatively expensive setup call.
        // If this kernel runs long enough on big arrays to trip a display
        // watchdog, that would be reported as a launch failure at the next
        // synchronizing call -- confirm by profiling before changing the
        // seed/sequence scheme.
        curand_init(seed, tid, 0, &states[tid]);
    }
}
/**
 * Host-side wrapper that launches _rndInit_ to initialize cuRAND states.
 *
 * Launches 128 threads per block and enough blocks to give every element its
 * own thread, capped at MAX_BLOCKS; when the cap applies, the kernel's
 * strided loop lets each thread handle several elements.
 *
 * The kernel launch itself is asynchronous: faults raised while the kernel
 * runs are reported by a later synchronizing call made by the caller.
 *
 * @param seed       Seed forwarded to the kernel for every state.
 * @param states     Array of states handed to the kernel.
 * @param arraySize  Number of states to initialize; values <= 0 are a no-op.
 */
void rndInit(unsigned int seed, curandState_t* states, int arraySize) {
    // Nothing to do, and a zero-block launch would be an invalid configuration.
    if (arraySize <= 0) {
        return;
    }

    const int threads = 128;
    // Ceiling division written so it cannot overflow int near INT_MAX.
    const int blocksNeeded = (arraySize - 1) / threads + 1;
    const int blocks = std::min(blocksNeeded, MAX_BLOCKS);

    // Forward the caller's seed so a given seed always yields the same states.
    _rndInit_ <<< blocks, threads >>>(seed, states, arraySize);

    // Launch-configuration errors are only visible through cudaGetLastError().
    checkCudaErrors(cudaGetLastError());
}
/**
 * Allocates arraySize cuRAND states on the device, initializes them through
 * rndInit, waits for the kernel to finish, and releases the device.
 *
 * @return 0 when every CUDA call succeeded, 1 if synchronization, freeing the
 *         buffer, or resetting the device reported an error. A failed
 *         allocation is handled by checkCudaErrors.
 */
int main() {
    curandState_t* d_states = NULL;
    int exitCode = 0;

    checkCudaErrors(cudaMalloc((void**)&d_states, arraySize * sizeof(curandState_t)));

    // Seed from the wall clock; the cast makes the time_t narrowing explicit.
    rndInit(static_cast<unsigned int>(time(0)), d_states, arraySize);

    // Errors raised while the kernel executes only surface at a sync point.
    cudaError_t cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: "
                  << cudaGetErrorString(cudaStatus)
                  << " (error " << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    cudaStatus = cudaFree(d_states);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaFree failed: "
                  << cudaGetErrorString(cudaStatus)
                  << " (error " << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceReset failed: "
                  << cudaGetErrorString(cudaStatus)
                  << " (error " << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    return exitCode;
}