cub BlockRadixSort:如何处理较大的 tile(数据块)尺寸,或对多个 tile 进行排序?

时间:2014-03-03 04:06:34

标签: sorting cuda cub

当使用 cub::BlockRadixSort 在线程块内进行排序时,如果元素数量太大,我们该如何处理?如果把 tile 尺寸设得太大,共享内存很快就装不下所需的临时存储。如果把数据拆分成多个 tile,对每个 tile 分别排序之后,又该如何对它们进行后处理?

1 个答案:

答案 0 :(得分:3)

  • 警告:我并不是 cub 方面的专家(差得远)。
  • 您可能希望查看此question/answer,因为我正在基于我在那里所做的一些工作。
  • 当然,如果问题规模足够大,那么 device-wide sort(设备级排序)似乎才是您真正应该考虑的方案。但您的问题似乎集中在块内排序上。

根据我的测试,cub并不真正了解原始数据的位置或放置临时存储的位置。因此,一种可能的解决方案就是将临时存储放在全局内存中。为了分析这个,我创建了一个包含3个不同测试用例的代码:

  1. 测试把临时存储放在全局内存中的 cub 块排序版本。
  2. 测试改编自示例 here 的原始 cub 块排序版本。
  3. 测试从我之前的回答衍生出的 cub 块排序版本,其中不存在与全局内存之间的数据拷贝,即假设数据已经"驻留在芯片上",也就是位于共享内存中。

  这些代码都没有经过大量测试,但既然我用的是 cub 提供的构建块,并且在前两种情况下核对了排序结果,希望我没有犯下什么严重的错误。以下是完整的测试代码,之后我会再做一些补充说明:

    $ cat t10.cu
    #include <cub/cub.cuh>
    #include <stdio.h>
    #include <stdlib.h>
    #include <thrust/sort.h>
    #define nTPB 512
    #define ELEMS_PER_THREAD 2
    #define RANGE (nTPB*ELEMS_PER_THREAD)
    #define DSIZE (nTPB*ELEMS_PER_THREAD)
    
    
    
    // Check the sticky CUDA error state and abort with a descriptive message.
    // Invoke after runtime API calls or kernel launches; `msg` is a
    // caller-supplied tag identifying the call site.
    // FIX: the original used the identifier `__err`; names containing a leading
    // double underscore are reserved for the implementation in C++, so it is
    // renamed to `err_`.
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t err_ = cudaGetLastError(); \
            if (err_ != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(err_), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
    
    using namespace cub;
    // GLOBAL CUB BLOCK SORT KERNEL
    // Specialize the BlockRadixSort collective for int keys, nTPB threads,
    // ELEMS_PER_THREAD keys per thread.
    typedef BlockRadixSort<int, nTPB, ELEMS_PER_THREAD> my_block_sort;
    // Both the keys and the collective's temporary storage are device symbols,
    // i.e. they live in global memory rather than shared memory.
    __device__ int my_val[DSIZE];
    __device__ typename my_block_sort::TempStorage sort_temp_stg;

    // Block-sorting CUDA kernel: nTPB threads, each owning ELEMS_PER_THREAD ints.
    // Sorts my_val[] in place using the global-memory temp storage above.
    // Launch with a single block of nTPB threads.
    __global__ void global_BlockSortKernel()
    {
        // Each thread hands the collective its ELEMS_PER_THREAD-key slice of the
        // global array, viewed as a reference to a fixed-size array.
        int *my_keys = my_val + threadIdx.x * ELEMS_PER_THREAD;
        my_block_sort(sort_temp_stg).Sort(*reinterpret_cast<int(*)[ELEMS_PER_THREAD]>(my_keys));
    }
    
    // ORIGINAL CUB BLOCK SORT KERNEL
    // ORIGINAL CUB BLOCK SORT KERNEL
    // Loads one block-sized segment of keys from global memory, sorts it using
    // shared-memory temp storage, and stores the sorted segment back out.
    // Launch with gridDim.x segments of BLOCK_THREADS threads each; d_in/d_out
    // must hold gridDim.x * BLOCK_THREADS * ITEMS_PER_THREAD ints.
    template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
    __global__ void BlockSortKernel(int *d_in, int *d_out)
    {
        // Specialize the BlockLoad, BlockStore, and BlockRadixSort collectives.
        typedef cub::BlockLoad<int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_TRANSPOSE> LoadT;
        typedef cub::BlockStore<int*, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_TRANSPOSE> StoreT;
        typedef cub::BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> SortT;
        // A single shared-memory region, repurposed by each collective in turn.
        __shared__ union {
            typename LoadT::TempStorage load;
            typename StoreT::TempStorage store;
            typename SortT::TempStorage sort;
        } smem;
        // This block's segment of consecutive keys (blocked across threads).
        int keys[ITEMS_PER_THREAD];
        int seg_base = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD);
        LoadT(smem.load).Load(d_in + seg_base, keys);
        __syncthreads(); // barrier before the union is reused by the sort
        // Collectively sort the keys
        SortT(smem.sort).Sort(keys);
        __syncthreads(); // barrier before the union is reused by the store
        // Store the sorted segment
        StoreT(smem.store).Store(d_out + seg_base, keys);
    }
    
    
    
    // SHARED MEM CUB BLOCK SORT KERNEL
    // Block-sorting kernel with data already resident "on chip": synthesizes a
    // tile of keys directly in shared memory, sorts it there, and performs no
    // real global-memory traffic.  d_out exists only so the compiler cannot
    // discard the sort as dead code.
    // FIX/GENERALIZATION: the original hard-coded exactly two elements per
    // thread in both the synthetic init and the dummy store (its own comment
    // flagged this); both now loop over ITEMS_PER_THREAD, producing identical
    // values for ITEMS_PER_THREAD == 2 and working for any ITEMS_PER_THREAD >= 1.
    template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
    __global__ void shared_BlockSortKernel(int *d_out)
    {
        __shared__ int my_val[BLOCK_THREADS*ITEMS_PER_THREAD];
        // Specialize BlockRadixSort collective types
        typedef BlockRadixSort<int, BLOCK_THREADS, ITEMS_PER_THREAD> my_block_sort;
        // Allocate shared memory for collectives
        __shared__ typename my_block_sort::TempStorage sort_temp_stg;

        // Synthetic data: element k of thread t gets t + k*BLOCK_THREADS + 5,
        // matching the original two-element pattern when ITEMS_PER_THREAD == 2.
        for (int k = 0; k < ITEMS_PER_THREAD; k++)
            my_val[threadIdx.x*ITEMS_PER_THREAD + k] = threadIdx.x + k*BLOCK_THREADS + 5;
        __syncthreads();

        // Collectively sort the keys; each thread passes its blocked slice
        // viewed as a fixed-size array reference.
        my_block_sort(sort_temp_stg).Sort(*static_cast<int(*)[ITEMS_PER_THREAD]>(static_cast<void*>(my_val+(threadIdx.x*ITEMS_PER_THREAD))));
        __syncthreads();

        // Dummy, effectively-never-taken store: clock() is opaque to the
        // optimizer, so the compiler must keep the sorted values alive.
        if (threadIdx.x == clock()){
            for (int k = 0; k < ITEMS_PER_THREAD; k++)
                d_out[threadIdx.x*ITEMS_PER_THREAD + k] = my_val[threadIdx.x*ITEMS_PER_THREAD + k];
        }
    }
    
    
    // Host test driver: fills DSIZE random keys, then times three single-block
    // sorting approaches on the same data size:
    //   1. global_BlockSortKernel - cub sort with keys + temp storage in global memory
    //   2. BlockSortKernel        - canonical cub example (global -> shared -> global)
    //   3. shared_BlockSortKernel - data synthesized and sorted entirely on chip
    // Results of tests 1 and 2 are validated on the host with thrust::is_sorted.
    // FIXES vs. original: error checks after cudaEventCreate and after the H2D
    // copy of d_in; all device/host resources are released before exit.
    int main(){
        int *h_data, *h_result;
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaCheckErrors("cudaEventCreate fail");
        h_data=(int *)malloc(DSIZE*sizeof(int));
        h_result=(int *)malloc(DSIZE*sizeof(int));
        if (h_data == 0) {printf("malloc fail\n"); return 1;}
        if (h_result == 0) {printf("malloc fail\n"); return 1;}
        for (int i = 0 ; i < DSIZE; i++) h_data[i] = rand()%RANGE;
        // first test sorting directly out of global memory
        global_BlockSortKernel<<<1,nTPB>>>(); //warm up run
        cudaDeviceSynchronize();
        cudaMemcpyToSymbol(my_val, h_data, DSIZE*sizeof(int));
        cudaCheckErrors("memcpy to symbol fail");
        cudaEventRecord(start);
        global_BlockSortKernel<<<1,nTPB>>>(); //timing run
        cudaEventRecord(stop);
        cudaDeviceSynchronize();
        cudaCheckErrors("cub 1 fail");
        cudaEventSynchronize(stop);
        float et;
        cudaEventElapsedTime(&et, start, stop);
        cudaMemcpyFromSymbol(h_result, my_val, DSIZE*sizeof(int));
        cudaCheckErrors("memcpy from symbol fail");
        if(!thrust::is_sorted(h_result, h_result+DSIZE)) { printf("sort 1 fail!\n"); return 1;}
        printf("global Elapsed time: %fms\n", et);
        printf("global Kkeys/s: %d\n", (int)(DSIZE/et));
        // now test original CUB block sort copying global to shared
        int *d_in, *d_out;
        cudaMalloc((void **)&d_in, DSIZE*sizeof(int));
        cudaMalloc((void **)&d_out, DSIZE*sizeof(int));
        cudaCheckErrors("cudaMalloc fail");
        // warm-up sorts whatever is in d_in; validity doesn't matter here
        BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_in, d_out); // warm up run
        cudaMemcpy(d_in, h_data, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy H to D fail");
        cudaEventRecord(start);
        BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_in, d_out); // timing run
        cudaEventRecord(stop);
        cudaDeviceSynchronize();
        cudaCheckErrors("cub 2 fail");
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&et, start, stop);
        cudaMemcpy(h_result, d_out, DSIZE*sizeof(int), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy D to H fail");
        if(!thrust::is_sorted(h_result, h_result+DSIZE)) { printf("sort 2 fail!\n"); return 1;}
        printf("CUB Elapsed time: %fms\n", et);
        printf("CUB Kkeys/s: %d\n", (int)(DSIZE/et));
        // now test shared memory-only version of block sort
        // (no host-side validation possible: data never leaves the chip)
        shared_BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_out); // warm-up run
        cudaEventRecord(start);
        shared_BlockSortKernel<nTPB, ELEMS_PER_THREAD><<<1, nTPB>>>(d_out); // timing run
        cudaEventRecord(stop);
        cudaDeviceSynchronize();
        cudaCheckErrors("cub 3 fail");
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&et, start, stop);
        printf("shared Elapsed time: %fms\n", et);
        printf("shared Kkeys/s: %d\n", (int)(DSIZE/et));
        // release device and host resources before exiting
        cudaFree(d_in);
        cudaFree(d_out);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        free(h_data);
        free(h_result);
        return 0;
    }
    $ nvcc -O3 -arch=sm_20 -o t10 t10.cu
    $ ./t10
    global Elapsed time: 0.236960ms
    global Kkeys/s: 4321
    CUB Elapsed time: 0.042816ms
    CUB Kkeys/s: 23916
    shared Elapsed time: 0.040192ms
    shared Kkeys/s: 25477
    $
    

    对于这个测试,我使用的是 CUDA 6.0RC、cub v1.2.0(当时的最新版本)、RHEL5.5/gcc4.1.2 以及 Quadro5000 GPU(cc2.0,11 个 SM,比 GTX480 慢约 40%)。以下是我的一些观察结果:

    1. 原始cub排序(2)与全局内存排序(1)的速度比约为6:1,大约是共享内存(~1TB / s)与全局内存的带宽比(~150GB / S)。
    2. 原始的 cub 排序(2)按 SM 数量(11)折算后的吞吐量约为 263MKeys/s,已经达到我在此设备上见过的最佳设备级排序速度(约 480MKeys/s)的相当大一部分。
    3. 仅共享内存排序并不比从全局内存复制输入/输出的原始cub排序快,表明从全局内存到cub临时存储的副本不是整个处理时间的很大一部分
    4. 6:1 的性能差距是很大的代价。所以我的建议是:对于超出 cub 块排序所能轻松处理范围的问题规模,尽可能使用设备级排序。这样可以借助一些最优秀的 GPU 代码作者在排序上的专业积累,使吞吐量更接近整个设备的能力上限。

      请注意,为了能在相近条件下对比测试,这里的问题规模(512 个线程,每线程 2 个元素)并没有超出 cub 块排序的处理能力。但是把数据规模扩大到更大的值(例如每线程 1024 个元素)并不难,而那时(在这几种方案中)就只有第一种方法能够处理。对于这样更大的问题规模,在我的 cc2.0 设备上,全局内存版块排序的吞吐量大约只有 6MKeys/s。