我正在尝试在 CUDA 中实现双调排序(Bitonic Sort),并测量运行时间随数组大小的变化。但是每当我在 32 * 32768 * 512 个元素(即 32 * 32768 个线程块,每个块 512 个线程)上运行时,程序都会返回错误 719。

下面是我按照“最小、完整、可验证示例”准则简化后的程序代码。它或许还能进一步简化,但我不确定该怎么做。我保留了 Visual Studio 默认 NVIDIA CUDA 项目模板中的内容,因为我认为它对错误处理很有用。

    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    #include <thrust/remove.h>
    #include <thrust/merge.h>
    #include <thrust\partition.h>
    #include <thrust\execution_policy.h>
    #include <cstdlib>
    #include <cstdio>
    #include <iostream>
    #include <time.h>
    using namespace std;

    cudaError_t BitonicSortCuda(int* a, std::size_t size);

    // Launch configuration. Each macro body is parenthesized so that it expands
    // safely inside a larger expression (e.g. `x / BLOCKS`).
    #define BLOCKS (32LL * 32768LL) // number of thread blocks in the 1D grid
    #define THREADS (512LL)         // number of threads per block

    // Number of array elements: one element per thread of a BLOCKS x THREADS
    // launch. ELEMS is referenced by the code below, but no definition is visible
    // in this file; the guard keeps a definition supplied elsewhere (for example
    // on the compiler command line) in effect.
    #ifndef ELEMS
    #define ELEMS ((std::size_t)(BLOCKS) * (std::size_t)(THREADS))
    #endif

    // Returns a pseudo-random integer in the closed range [min, max].
    // (An earlier comment claimed [min, max), but the modulus `(max + 1) - min`
    // makes `max` itself reachable.)
    //
    // The generator is seeded from the current time on the first call only.
    // Precondition: min <= max and max < INT_MAX; otherwise the modulus is
    // non-positive or `max + 1` overflows.
    int random(int min, int max)
    {
        static bool first = true;
        if (first) {
            srand((unsigned int)time(NULL)); // seed exactly once
            first = false;
        }
        return min + rand() % ((max + 1) - min);
    }

    // One compare-exchange step of the bitonic sort, launched once per (j, k)
    // pair by the host loop in BitonicSort.
    //
    // Launch layout: 1D grid of 1D blocks; each thread handles the flat index
    //   i = blockIdx.x * blockDim.x + threadIdx.x
    // and its partner ixj = i ^ j. No shared memory is used.
    //
    // Parameters:
    //   arr1 - device array being sorted in place.
    //   j    - XOR distance to the partner element.
    //   k    - size of the current bitonic sequence; bit k of i selects the
    //          sort direction for the pair.
    //
    // Precondition: there is NO bounds check (the signature carries no length),
    // so both i and i ^ j must be valid indices of arr1 for every launched thread.
    __global__ void BitonicStepKernel(int *arr1, int j, int k)
    {
        // Widen before multiplying so the flat index is computed in size_t rather
        // than in 32-bit unsigned arithmetic.
        const std::size_t i = (std::size_t)blockIdx.x * blockDim.x + threadIdx.x;
        const std::size_t ixj = i ^ (std::size_t)j;

        // Each pair (i, ixj) is visited by two threads; only the one holding the
        // lower index does the exchange, so no element is written by two threads.
        if (ixj <= i) {
            return;
        }

        const int lower = arr1[i];
        const int upper = arr1[ixj];

        // (i & k) == 0 -> this pair is ordered ascending, otherwise descending.
        const bool ascending = (i & (std::size_t)k) == 0;
        const bool outOfOrder = ascending ? (lower > upper) : (lower < upper);
        if (outOfOrder) {
            arr1[i] = upper;
            arr1[ixj] = lower;
        }
    }



    // Sorts the device array `arr1` of `size` ints in ascending order by enqueuing
    // one BitonicStepKernel launch per (k, j) stage of the bitonic network.
    //
    // Parameters:
    //   arr1 - device pointer to the data (sorted in place).
    //   size - number of elements; must be a power of two no larger than 2^30
    //          (the kernel receives j and k as int, and its partner index i ^ j
    //          is only guaranteed in range for power-of-two lengths).
    //
    // The launches are only enqueued here; the function does not synchronize and
    // does not report errors. The caller is expected to check cudaGetLastError()
    // and synchronize afterwards.
    //
    // NOTE(review): no host sync is placed between launches on purpose (a sync
    // per launch only adds host stalls). If a very large input fails with an
    // unspecified launch failure on a Windows display GPU, the WDDM watchdog
    // (TDR) timeout is a candidate cause - TODO confirm on the target machine.
    void BitonicSort(int *arr1, std::size_t size)
    {
        if (size < 2) {
            return; // zero or one element is already sorted
        }
        if ((size & (size - 1)) != 0 || size > ((std::size_t)1 << 30)) {
            fprintf(stderr, "BitonicSort: size must be a power of two <= 2^30\n");
            return;
        }

        // One thread per element: derive the grid from `size` so the kernel,
        // which has no bounds check, never indexes past the end of the array.
        const std::size_t threadsPerBlock =
            size < (std::size_t)(THREADS) ? size : (std::size_t)(THREADS);
        const dim3 grid((unsigned int)(size / threadsPerBlock), 1);
        const dim3 block((unsigned int)threadsPerBlock, 1);

        // k: length of the bitonic sequences being merged; j: compare distance.
        // size_t counters avoid the signed overflow an int `k <<= 1` can hit.
        for (std::size_t k = 2; k <= size; k <<= 1) {
            for (std::size_t j = k >> 1; j > 0; j >>= 1) {
                BitonicStepKernel<<<grid, block>>>(arr1, (int)j, (int)k);
            }
        }
    }

    // Prints the interval between two clock() samples in seconds, using
    // CLOCKS_PER_SEC for the conversion, with millisecond precision.
    //
    // Parameters:
    //   start - clock() value taken before the measured region.
    //   stop  - clock() value taken after the measured region.
    void print_elapsed(clock_t start, clock_t stop)
    {
        const double seconds = (double)(stop - start) / CLOCKS_PER_SEC;
        printf("\nElapsed time: %.3fs\n", seconds);
    }

    // Program entry point: fills a host array of ELEMS random values in [0, 100],
    // prints a preview, sorts the array on the GPU via BitonicSortCuda, prints the
    // preview again and resets the device.
    //
    // Returns 0 on success, 1 if sorting or the device reset fails.
    int main()
    {
        const std::size_t count = ELEMS;
        // Never read past the end of the array when it holds fewer than 1024 items.
        const std::size_t preview = count < 1024 ? count : 1024;

        int* d = new int[count];
        for (std::size_t i = 0; i < count; i++) {
            d[i] = random(0, 100);
        }

        cout << "\n" << "First 1024 numbers of the first array:" << "\n";
        for (std::size_t i = 0; i < preview; i++) {
            cout << d[i] << " ";
        }

        cudaError_t cudaStatus = BitonicSortCuda(d, count);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "Bitonic sort failed!");
            delete[] d; // do not leak the host buffer on the failure path
            return 1;
        }

        cout << "\n" << "First 1024 numbers of the first array after sorting:" << "\n";
        for (std::size_t i = 0; i < preview; i++) {
            cout << d[i] << " ";
        }

        // cudaDeviceReset must be called before exiting in order for profiling and
        // tracing tools such as Nsight and Visual Profiler to show complete traces.
        cudaStatus = cudaDeviceReset();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceReset failed!");
            delete[] d;
            return 1;
        }

        delete[] d;
        return 0;
    }

    // Host wrapper that sorts `size` ints of the host array `a` on GPU 0.
    //
    // Steps: select device 0, allocate a device buffer, copy `a` to the device,
    // run BitonicSort, wait for the device to finish, print the elapsed time, and
    // copy the result back into `a`. The device buffer is released on every path.
    //
    // Parameters:
    //   a    - host array, overwritten with the sorted data on success.
    //   size - number of elements in `a`.
    //
    // Returns cudaSuccess, or the first CUDA error encountered (a message is
    // written to stderr in that case).
    cudaError_t BitonicSortCuda(int* a, std::size_t size)
    {
        // All locals are declared up front so that no `goto Error` jumps across
        // an initialization.
        int *dev_a = 0;
        cudaError_t cudaStatus;
        clock_t start = 0, stop = 0;
        const std::size_t bytes = size * sizeof(int);

        // Choose which GPU to run on, change this on a multi-GPU system.
        cudaStatus = cudaSetDevice(0);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
            goto Error;
        }

        cudaStatus = cudaMalloc((void**)&dev_a, bytes);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed!");
            goto Error;
        }

        // Copy the input from host memory to the GPU buffer.
        cudaStatus = cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        start = clock();
        BitonicSort(dev_a, size);

        // Launch-configuration errors surface here.
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "Bitonic sort kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // Kernel launches are asynchronous: the clock is stopped only after the
        // device has finished, otherwise just the time to enqueue the launches
        // would be measured. Errors raised while the kernels ran surface here.
        cudaStatus = cudaDeviceSynchronize();
        stop = clock();
        print_elapsed(start, stop);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Bitonic Sort!\n", cudaStatus);
            goto Error;
        }

        // Copy the sorted data from the GPU buffer back to host memory.
        cudaStatus = cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

    Error:
        cudaFree(dev_a); // clean up the GPU memory (a no-op if dev_a is still null)

        return cudaStatus;
    }

我的显卡是 NVIDIA GeForce GTX 950M;我自己的计算以及 deviceQuery 示例的输出都表明,显存足以运行这个实验。

从代码里的注释可以看到,我曾尝试在循环中调用 cudaDeviceSynchronize() 来解决问题,这似乎确实能消除错误。但这种做法的副作用是,执行时间似乎从不到一毫秒增加到了几秒(甚至几分钟),测得的时间复杂度变成了 $n \log^2 n$,而这并不是该排序器的并行计算时间,不符合我作业的要求。



我多次验证过结果:对于长度为 2 的幂的数组,排序器工作正常(我本想支持任意长度的数组,但被告知这对本次作业来说过于复杂),但是一旦达到上述大小,问题就出现了。我没有尝试在大小为 64 * 32768 * 512 的数组上运行,因为有人告诉我这已经超出了我的硬件限制。


编辑:已贴出完整代码。

