CUDA内核数组长度有限制吗?

时间:2018-03-15 14:40:18

标签: arrays cuda

我想使用GPU生成一个整数数组。我在NVIDIA文档中找到了一些解决方案,并在此基础上编写了下面的简单代码。运行时,只有当arraySize变量小于或等于291670时它才能正常工作;对于更大的值,调用cudaDeviceSynchronize()会返回cudaErrorLaunchFailure(错误4)——“未指定的启动失败”(unspecified launch failure)。

在我的解决方案中,我需要更长的数组。

这是数组长度本身存在某种限制,还是我的代码有错误?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <curand_kernel.h>
#include <helper_cuda.h>
#include <curand.h>
#include <stdio.h>
#include <algorithm>
#include <ctime>
#include <iostream>
#include <cstdlib>
// Launch-configuration bounds and problem size.
//
// These are deliberately plain host constants (no __device__ qualifier):
// they are read by host code (rndInit and main), and a __device__ variable
// must not be accessed directly from a host function. Const-qualified
// integral constants with a constant initializer remain usable by value in
// device code, so nothing is lost by dropping the qualifier.
// NOTE(review): 1024 / 65535 presumably mirror device launch limits — confirm
// against the target GPU's properties.
const int MAX_THREADS_PER_BLOCK = 1024;
const int MAX_BLOCKS = 65535;            // cap applied to the grid size in rndInit
const unsigned int arraySize = 291670;   // number of curandState_t elements allocated in main
// Initializes one cuRAND generator state per array element.
//
// Launch layout: 1D grid of 1D blocks; any grid size is valid because each
// thread walks the array with a grid-stride loop. No shared memory is used.
//
// seed      - seed shared by every state.
// states    - device array with at least arraySize elements; element i is
//             initialized with sequence number i and offset 0.
// arraySize - number of states to initialize.
//
// NOTE(review): the cuRAND documentation warns that curand_init becomes more
// expensive as the sequence number grows. For large arraySize this kernel may
// run long enough to trip a display watchdog timeout, which would surface as
// a launch failure at the next synchronizing call — confirm with a profiler.
__global__ void _rndInit_(unsigned int seed, curandState_t* states, unsigned int arraySize) {
    // 64-bit index and stride: `long` is only 32 bits on LLP64 platforms, and
    // blockIdx.x * blockDim.x is otherwise evaluated in 32-bit arithmetic.
    const unsigned long long stride =
        static_cast<unsigned long long>(blockDim.x) * gridDim.x;
    for (unsigned long long tid =
             static_cast<unsigned long long>(blockIdx.x) * blockDim.x + threadIdx.x;
         tid < arraySize;
         tid += stride) {
        curand_init(seed, tid, 0, &states[tid]);
    }
}
// Host wrapper that launches _rndInit_ to initialize `arraySize` cuRAND states.
//
// seed      - seed forwarded to the kernel (previously ignored in favour of
//             a fresh time(0) call, which made the parameter meaningless).
// states    - device pointer to at least `arraySize` curandState_t elements.
// arraySize - number of states to initialize; values <= 0 are a no-op.
//
// The launch is asynchronous: only launch-configuration errors are checked
// here. Execution errors surface at the caller's next synchronizing call.
void rndInit(unsigned int seed, curandState_t* states, int arraySize) {
    if (arraySize <= 0) {
        return;  // a zero-block launch would be an invalid configuration
    }

    const int threads = 128;
    // Ceil-div written so that it cannot overflow int for large arraySize.
    int blocks = (arraySize - 1) / threads + 1;
    if (blocks > MAX_BLOCKS) {
        blocks = MAX_BLOCKS;  // the kernel's grid-stride loop covers the remainder
    }

    _rndInit_ <<< blocks, threads >>>(seed, states, arraySize);
    checkCudaErrors(cudaGetLastError());  // catch bad launch configuration immediately
}
// Allocates `arraySize` cuRAND states on the device, initializes them via
// rndInit, waits for completion, and releases the device.
//
// Returns 0 on success and 1 if synchronization, deallocation, or device
// reset reports an error. An allocation failure is handled by checkCudaErrors.
int main() {
    curandState_t* d_states = NULL;
    checkCudaErrors(cudaMalloc((void**)&d_states, arraySize * sizeof(curandState_t)));

    rndInit(time(0), d_states, arraySize);

    int exitCode = 0;

    // Kernel execution errors are asynchronous and only become visible here.
    cudaError_t cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(cudaStatus)
                  << " (" << static_cast<int>(cudaStatus) << ")" << std::endl;
        exitCode = 1;
    }

    // Always attempt cleanup, even after a failure, but report problems.
    cudaStatus = cudaFree(d_states);
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaFree failed: " << cudaGetErrorString(cudaStatus)
                  << " (" << static_cast<int>(cudaStatus) << ")" << std::endl;
        exitCode = 1;
    }

    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        std::cerr << "cudaDeviceReset failed: " << cudaGetErrorString(cudaStatus)
                  << " (" << static_cast<int>(cudaStatus) << ")" << std::endl;
        exitCode = 1;
    }

    return exitCode;
}

0 个答案:

没有答案