我正在尝试在 CUDA 中实现 Bitonic Sorter(双调排序器),并针对不同的数组大小测量排序时间。但每当我尝试在大小为 32 * 32768 * 512 的数组上运行(即网格为 32 * 32768 个线程块,每块 512 个线程)时,程序就会返回错误 719。
这是我按照“最小、完整、可验证示例”(MCVE)准则简化后的代码。我怀疑它还可以进一步简化,但我不确定该怎么做。我保留了 Visual Studio 默认的 Nvidia CUDA 项目模板中的内容,因为我认为它对错误处理很有用。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Forward slashes in include paths are portable; backslashes are
// implementation-defined and only happen to work under MSVC.
#include <thrust/execution_policy.h>
#include <thrust/merge.h>
#include <thrust/partition.h>
#include <thrust/remove.h>

#include <cstdio>
#include <cstdlib>
#include <iostream> // std::cout is used in main(); was only included transitively before
#include <time.h>

using namespace std;
// Host entry point that owns all device memory for the sort (defined below).
cudaError_t BitonicSortCuda(int* a, std::size_t size);

// Launch configuration. BLOCKS * THREADS must be a power of two: the
// bitonic network only sorts power-of-two-sized arrays.
// Macro bodies are parenthesized so they expand safely inside larger
// expressions such as `ELEMS * sizeof(int)` or `k <= ELEMS`.
#define BLOCKS (32LL * 32768LL)
#define THREADS 512LL
#define ELEMS (BLOCKS * THREADS)
// Returns a pseudo-random integer in the INCLUSIVE range [min, max].
// (The previous comment claimed [min, max), but `rand() % ((max + 1) - min)`
// produces max - min + 1 distinct values, so max itself can be returned.)
// Seeds the C RNG from the wall clock on the first call only.
// Note: `rand() % n` carries modulo bias when n does not divide
// RAND_MAX + 1; acceptable here, since the values only seed test data.
int random(int min, int max) // range : [min, max]
{
    static bool first = true;
    if (first)
    {
        srand((unsigned)time(NULL)); // seed once per process; cast avoids narrowing warning
        first = false;
    }
    return min + rand() % ((max + 1) - min);
}
// One global compare-exchange step of a bitonic sorting network.
// Expects a 1-D launch covering the array exactly: gridDim.x * blockDim.x
// threads, one per element. `j` is the partner distance (partner = i ^ j);
// `k` is the current bitonic subsequence length, and bit (i & k) selects
// the sort direction of each half.
__global__ void BitonicStepKernel(int *arr1, int j, int k)
{
    // Widen BEFORE multiplying: blockDim.x * blockIdx.x is 32-bit unsigned
    // arithmetic and silently overflows on very large grids.
    std::size_t i = (std::size_t)blockDim.x * blockIdx.x + threadIdx.x;
    std::size_t ixj = i ^ (std::size_t)j; // index of this thread's exchange partner
    // Only the lower-indexed thread of each pair performs the swap, so the
    // pair is touched exactly once.
    if (ixj > i) {
        bool ascending = (i & (std::size_t)k) == 0; // direction within this k-block
        int a = arr1[i];
        int b = arr1[ixj];
        // Swap when the pair is out of order for its direction.
        if (ascending ? (a > b) : (a < b)) {
            arr1[i] = b;
            arr1[ixj] = a;
        }
    }
}
// Runs the complete bitonic sorting network over the device array arr1.
// The grid is fixed by the BLOCKS/THREADS macros, so arr1 must hold
// exactly ELEMS ints. Each (k, j) pair is one compare-exchange pass;
// passes on the default stream are serialized by the device, so no
// explicit synchronization between launches is required for correctness.
// NOTE(review): on a display GPU under Windows/WDDM, queueing this many
// long-running kernels back-to-back can trip the driver watchdog
// (reported as error 719). Raising the TDR delay or running on a
// non-display device avoids that without serializing the network.
void BitonicSort(int *arr1, std::size_t size)
{
    dim3 blockdim(BLOCKS, 1);
    dim3 threaddim(THREADS, 1);
    for (int k = 2; k <= ELEMS; k <<= 1) {       // bitonic subsequence length
        for (int j = k >> 1; j > 0; j >>= 1) {   // compare-exchange distance
            BitonicStepKernel << <blockdim, threaddim >> > (arr1, j, k);
            // Cheap launch-configuration check: cudaGetLastError does NOT
            // synchronize, so it does not distort timing the way a
            // cudaDeviceSynchronize in this loop would.
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                fprintf(stderr, "BitonicStepKernel launch failed: %s\n", cudaGetErrorString(err));
                return;
            }
        }
    }
}
// Print the wall-clock duration between two clock() samples, in seconds.
void print_elapsed(clock_t start, clock_t stop)
{
    const double seconds = (double)(stop - start) / CLOCKS_PER_SEC;
    printf("\nElapsed time: %.3fs\n", seconds);
}
// Entry point: fills an array of ELEMS ints with random values, sorts it
// on the GPU, and prints the first 1024 elements before and after as a
// sanity check.
int main()
{
    int* d = new int[ELEMS]; // ~2 GiB of host data for ELEMS = 2^29
    for (std::size_t i = 0; i < ELEMS; i++) {
        d[i] = random(0, 100);
    }
    cout << "\n" << "First 1024 numbers of the first array:" << "\n";
    for (int i = 0; i < 1024; i++) {
        cout << d[i] << " ";
    }
    // Sort on the device; the sorted data is copied back into d.
    cudaError_t cudaStatus = BitonicSortCuda(d, ELEMS);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "Bitonic sort failed!");
        delete[] d; // don't leak the host buffer on the error path
        return 1;
    }
    cout << "\n" << "First 1024 numbers of the first array after sorting:" << "\n";
    for (int i = 0; i < 1024; i++) {
        cout << d[i] << " ";
    }
    delete[] d;
    // cudaDeviceReset must be called before exiting in order for profiling
    // and tracing tools such as Nsight and Visual Profiler to show
    // complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t BitonicSortCuda(int* a, std::size_t size)
{
int *dev_a = 0;
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, ELEMS * sizeof(int));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, ELEMS * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
clock_t start, stop = 0;
start = clock();
BitonicSort(dev_a, size);
cudaStatus = cudaGetLastError();
stop = clock();
print_elapsed(start, stop);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "Bitonic sort kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Bitonic Sort!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(a, dev_a, ELEMS * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_a);// clean up the GPU memory
return cudaStatus;
}
我的视频卡是Nvidia GeForce GTX 950M,我的计算和deviceQuery示例都表明我有足够的空间来运行此实验。
如注释中可见,我曾尝试通过在代码中调用 cudaDeviceSynchronize() 来解决这个问题,这似乎确实有效。但这个解决方案的副作用是:执行时间从几分之一毫秒增加到数秒(甚至数分钟),因为它把网络中 n·log²(n) 个本可以异步排队的步骤串行化了,测得的也就不再是排序器的并行计算时间,这对我的作业来说是不可取的。
我已经向讲师展示了代码,他说这可能与看门狗定时器有关,还告诉我对较小的数组进行操作。但是负责实验室的老师希望我使用更大的数组,这就是为什么我决心解决这个错误。
这是负责排序实现本身的代码(CUDA中的内存在父函数中分配)。
我多次验证了我的结果:当数组大小是 2 的幂时,排序器都能正确工作(我本想支持可变的数组大小,但被告知这对本次作业来说太复杂了)。但一旦达到上述规模,问题就会出现。我没有尝试在大小为 64 * 32768 * 512 的数组上运行它,因为有人告诉我那已经超出了我的硬件限制。
但是,对我来说重要的是准确估计执行带有大数组的排序器所需的时间。我在代码中放置cudaDeviceSynchronize()的解决方案似乎对其产生了很大的影响。
编辑: 粘贴整个代码。