Question

我想使用 GPU 生成一个整数数组。我在 NVIDIA 文档中找到了一些解决方案，并在此基础上编写了下面的简单代码。运行时，只有当 arraySize 变量小于或等于 291670 时它才能正常工作；对于更大的值，调用 cudaDeviceSynchronize() 会返回 cudaErrorLaunchFailure（错误 4）——“未指定的启动失败”（unspecified launch failure）。

在我的方案中，我需要更长的数组。

这是数组长度本身存在某种限制，还是我的代码有错误？

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand_kernel.h>
#include <helper_cuda.h>
#include <curand.h>
#include <stdio.h>
#include <algorithm>
#include <ctime>
#include <iostream>
#include <cstdlib>
// Launch-configuration limits and problem size.
//
// These are plain host-side compile-time constants (no `__device__`
// qualifier): they are read directly by host code, and a const integer
// initialised with a constant expression can still be used by value in
// device code.
const int MAX_THREADS_PER_BLOCK = 1024;
// Upper bound applied to the number of blocks when sizing a launch.
const int MAX_BLOCKS = 65535;
// Number of cuRAND generator states to allocate and initialise.
const unsigned int arraySize = 291670;
// Initialises one cuRAND generator state per array element.
//
// Parameters:
//   seed      - seed shared by every generator state.
//   states    - device array with room for at least `arraySize` states.
//   arraySize - number of states to initialise.
//
// Launch layout: 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so any grid
// size covers the whole array.
//
// NOTE(review): every state gets its own sequence number (`idx`). According
// to the cuRAND documentation, setting up a state with a non-zero sequence
// involves a skip-ahead that can be expensive; with many states this kernel
// may run long enough to trip a display-driver watchdog, which would show up
// as a launch failure at the next synchronisation - confirm on the target
// platform before raising `arraySize`.
__global__ void _rndInit_(unsigned int seed, curandState_t* states, unsigned int arraySize) {
    // 64-bit index arithmetic: `long` is only 32 bits wide on some platforms,
    // and a 32-bit product of block index and block size can wrap.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
         idx < arraySize;
         idx += stride) {
        curand_init(seed, idx, 0, &states[idx]);
    }
}
// Host-side launcher for the _rndInit_ kernel.
//
// Parameters:
//   seed      - seed forwarded unchanged to every generator state.
//   states    - device pointer to an array of at least `arraySize` states.
//   arraySize - number of states to initialise; values <= 0 are a no-op.
//
// The launch is asynchronous: only launch-configuration errors are reported
// here (via checkCudaErrors). Errors raised while the kernel executes surface
// at the caller's next synchronising CUDA call.
void rndInit(unsigned int seed, curandState_t* states, int arraySize) {
    // A grid with zero blocks is an invalid launch configuration.
    if (arraySize <= 0) {
        return;
    }

    const int threads = 128;
    // Ceil-division written so it cannot overflow `int` for large sizes.
    const int neededBlocks = (arraySize - 1) / threads + 1;
    // The kernel loops over the array, so capping the grid is safe.
    const int blocks = std::min(neededBlocks, MAX_BLOCKS);

    // Pass the caller's seed rather than sampling the clock again here.
    _rndInit_<<<blocks, threads>>>(seed, states, arraySize);
    checkCudaErrors(cudaGetLastError());
}
// Allocates `arraySize` cuRAND states on the device, initialises them with a
// time-based seed, and reports any error raised while the kernel ran.
//
// Returns 0 on success and 1 if synchronisation or device reset fails.
int main() {
    curandState_t* d_states = NULL;
    checkCudaErrors(cudaMalloc((void**)&d_states, arraySize * sizeof(curandState_t)));

    rndInit(static_cast<unsigned int>(time(0)), d_states, static_cast<int>(arraySize));

    int exitCode = 0;

    // Kernel execution errors are only reported once the device is synchronised.
    cudaError_t cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(cudaStatus)
                  << " (error " << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }

    // Best-effort cleanup: the result is deliberately ignored because it may
    // merely repeat an error already reported above.
    cudaFree(d_states);

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceReset failed: " << cudaGetErrorString(cudaStatus)
                  << " (error " << cudaStatus << ")" << std::endl;
        exitCode = 1;
    }
    return exitCode;
}

CUDA内核数组长度有限制吗？

0 个答案: