我是C ++和CUDA的初学者。我试图编写一个可以计算素数的程序。该算法本身有效,但是我无法从GPU获得结果:内核停止计算后,我尝试使用cudaMemcpy
将数据复制回主机,但是会抛出cudaErrorInvalidValue
。
我的工作:由于我不知道确切会返回多少个质数,因此我决定制作一个结构,该结构将分配内存,存储指向内存/已用大小的指针,并具有一个Add
内核功能。
template <typename T>
struct KernelArray
{
T* array = nullptr;
int* size = nullptr;
const int capacity;
KernelArray(const int &capacity)
: capacity(capacity)
{
checkCudaErrors(cudaMalloc(&array, capacity * sizeof(T))); // this all works
checkCudaErrors(cudaMalloc(&size, sizeof(int)));
checkCudaErrors(cudaDeviceSynchronize());
}
~KernelArray()
{
checkCudaErrors(cudaFree(array));
checkCudaErrors(cudaFree(size));
}
void CopyToDevice(const T* arr, const int &size) // this works too
{
if (size > capacity) throw std::invalid_argument("argument 'size' is bigger than allocated memory size");
checkCudaErrors(cudaMemcpy(array, arr, size * sizeof(T), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(this->size, &size, sizeof(int), cudaMemcpyHostToDevice));
checkCudaErrors(cudaDeviceSynchronize());
}
int GetSizeFromDevice() const
{
int* toReturn = (int*)malloc(sizeof(int));
checkCudaErrors(cudaMemcpy(toReturn, size, sizeof(int), cudaMemcpyDeviceToHost)); // there is an error
checkCudaErrors(cudaDeviceSynchronize());
int ret = *toReturn;
free(toReturn);
return ret;
}
/* ... */
__device__ void dAdd(const T &a)
{
if (*size == capacity) return;
array[(*size)++] = a;
}
};
在main
中,我预先计算了一些质数以将其推入GPU,并将其传递到结构内部的内核中。
KernelArray<int> devPrimes(N / 4);
devPrimes.CopyToDevice(firstPrimes.data(), firstPrimes.size()); //from vector
findPrimesKernel <<<1, 1>>> (nSqrt, N, devPrimes);
auto cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
}
checkCudaErrors(cudaDeviceSynchronize());
int arrivedSize = devPrimes.GetSizeFromDevice(); // error when it steps inside here
int* arrivedArray = (int*)malloc(arrivedSize*sizeof(int));
devPrimes.CopyFromDevice(arrivedArray);
printf("last device primes: %d %d %d", arrivedArray[arrivedSize-3], arrivedArray[arrivedSize - 2], arrivedArray[arrivedSize - 1]);
内核代码检查数字,但仅使用KernelArray::dAdd
函数更改素数数组。内核完成而没有错误。
所以导致错误的代码是这样(发生的唯一错误):
int GetSizeFromDevice() const
{
int* toReturn = (int*)malloc(sizeof(int));
checkCudaErrors(cudaMemcpy(toReturn, size, sizeof(int), cudaMemcpyDeviceToHost)); // this line
// CUDA error at .../kernel.cu:46 code=11(cudaErrorInvalidValue) "cudaMemcpy(toReturn, size, sizeof(int), cudaMemcpyDeviceToHost)"
checkCudaErrors(cudaDeviceSynchronize());
int ret = *toReturn;
free(toReturn);
return ret;
}
内核功能:
_global__ void findPrimesKernel(int from, int to, KernelArray<int> primes)
{
printf("Start..\n");
for (int i = from+threadIdx.x; i <= to; i+=blockDim.x)
{
for (int p = 0; p < *primes.size; ++p)
{
auto prime = primes.array[p];
if (i % prime == 0) {
break;
}
if (i < prime*prime)
{
printf("[%d] found prime %d (size: %d, prime^2: %d^2)\n",threadIdx.x, i, *primes.size, prime);
primes.dAdd(i);
break;
}
}
}
printf("Done from %d to %d)\n", from, to);
}
有什么想法为什么不起作用,或者我应该怎么解决才能使其起作用?
谢谢!