Question

我是C ++和CUDA的初学者。我试图编写一个可以计算素数的程序。该算法本身有效，但是我无法从GPU获得结果：内核停止计算后，我尝试使用cudaMemcpy将数据复制回主机，但是会抛出cudaErrorInvalidValue。

我的工作：由于我不知道确切会返回多少个质数，因此我决定制作一个结构，该结构将分配内存，存储指向内存/已用大小的指针，并具有一个Add内核功能。

template <typename T>
struct KernelArray
{
    T*  array = nullptr;
    int* size = nullptr;
    const int capacity;

    KernelArray(const int &capacity)
        : capacity(capacity)
    {
        checkCudaErrors(cudaMalloc(&array, capacity * sizeof(T))); // this all works
        checkCudaErrors(cudaMalloc(&size, sizeof(int)));
        checkCudaErrors(cudaDeviceSynchronize());
    }

    ~KernelArray()
    {
        checkCudaErrors(cudaFree(array));
        checkCudaErrors(cudaFree(size));
    }

    void CopyToDevice(const T* arr, const int &size) // this works too
    {
        if (size > capacity) throw std::invalid_argument("argument 'size' is bigger than allocated memory size");
        checkCudaErrors(cudaMemcpy(array, arr, size * sizeof(T), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(this->size, &size, sizeof(int), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaDeviceSynchronize());
    }

    int GetSizeFromDevice() const
    {
        int* toReturn = (int*)malloc(sizeof(int));
        checkCudaErrors(cudaMemcpy(toReturn, size, sizeof(int), cudaMemcpyDeviceToHost)); // there is an error
        checkCudaErrors(cudaDeviceSynchronize());
        int ret = *toReturn;
        free(toReturn);
        return ret;
    }

    /* ... */

    __device__ void dAdd(const T &a)
    {
        if (*size == capacity) return;
        array[(*size)++] = a;
    }
};

在main中，我预先计算了一些质数以将其推入GPU，并将其传递到结构内部的内核中。

KernelArray<int> devPrimes(N / 4);
devPrimes.CopyToDevice(firstPrimes.data(), firstPrimes.size()); //from vector

findPrimesKernel <<<1, 1>>> (nSqrt, N, devPrimes);
auto cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
    fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
checkCudaErrors(cudaDeviceSynchronize());

int arrivedSize = devPrimes.GetSizeFromDevice(); // error when it steps inside here
int* arrivedArray = (int*)malloc(arrivedSize*sizeof(int));
devPrimes.CopyFromDevice(arrivedArray);

printf("last device primes: %d %d %d", arrivedArray[arrivedSize-3], arrivedArray[arrivedSize - 2], arrivedArray[arrivedSize - 1]);

内核代码检查数字，但仅使用KernelArray::dAdd函数更改素数数组。内核完成而没有错误。

所以导致错误的代码是这样（发生的唯一错误）：

int GetSizeFromDevice() const
{
    int* toReturn = (int*)malloc(sizeof(int));
    checkCudaErrors(cudaMemcpy(toReturn, size, sizeof(int), cudaMemcpyDeviceToHost)); // this line
    // CUDA error at .../kernel.cu:46 code=11(cudaErrorInvalidValue) "cudaMemcpy(toReturn, size, sizeof(int), cudaMemcpyDeviceToHost)"
    checkCudaErrors(cudaDeviceSynchronize());
    int ret = *toReturn;
    free(toReturn);
    return ret;
}

内核功能：

_global__ void findPrimesKernel(int from, int to, KernelArray<int> primes)
{
    printf("Start..\n");
    for (int i = from+threadIdx.x; i <= to; i+=blockDim.x) 
    {
        for (int p = 0; p < *primes.size; ++p) 
        {
            auto prime = primes.array[p];
            if (i % prime == 0) {
                break; 
            }

            if (i < prime*prime)  
            {
                printf("[%d] found prime %d (size: %d, prime^2: %d^2)\n",threadIdx.x, i, *primes.size, prime);
                primes.dAdd(i);
                break;
            }

        }
    }
    printf("Done from %d to %d)\n", from, to);
}

有什么想法为什么不起作用，或者我应该怎么解决才能使其起作用？

谢谢！

当我尝试从设备复制时，cudaMemcpy返回cudaErrorInvalidValue

0 个答案: