thrust::max_element 比 cublasIsamax 慢——有没有更高效的实现?

时间:2015-01-13 15:47:11

标签: c++ performance cuda thrust cublas

我需要一种快速高效的实现,用于在 CUDA 中查找数组最大值的索引。此操作需要执行很多次。我最初使用 cublasIsamax,但遗憾的是它返回的是绝对值最大的元素的索引,这不是我想要的。因此我改用 thrust::max_element,但它比 cublasIsamax 慢得多。我的用法如下:

//d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
thrust::device_vector<float>::iterator d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
max_index = d_it - (thrust::device_vector<float>::iterator)d_ptr;

向量中的元素数量介于 10,000 和 20,000 之间。thrust::max_element 与 cublasIsamax 之间的速度差异相当大。也许我在不知情的情况下执行了多次内存事务?

1 个答案:

答案 0 :(得分:6)

更高效的实现方式是在 CUDA 中自己编写求最大值索引的归约(reduction)代码。cublasIsamax 很可能在底层使用了类似的做法。

我们可以比较3种方法:

  1. thrust::max_element
  2. cublasIsamax
  3. 自定义CUDA内核
  下面是一个完整的可运行示例:

    $ cat t665.cu
    #include <cublas_v2.h>
    #include <thrust/extrema.h>
    #include <thrust/device_ptr.h>
    #include <thrust/device_vector.h>
    #include <iostream>
    #include <stdlib.h>
    
    // Number of float elements in the test vector.
    #define DSIZE 10000
    // Threads per block; the shared-memory tree reduction requires a power of 2.
    #define nTPB 256
    // Upper bound on the number of blocks actually launched.
    #define MAX_KERNEL_BLOCKS 30
    // Capacity of the per-block partial-result arrays (must be >= launched grid size).
    #define MAX_BLOCKS ((DSIZE/nTPB)+1)
    // Arguments are parenthesised so that expressions such as MIN(x^y, z) expand correctly.
    // Note: like any function-like macro, each argument may be evaluated twice.
    #define MIN(a,b) (((a)>(b))?(b):(a))
    // Sentinel used as the starting "maximum": the most negative finite float
    // (-FLT_MAX), so that negative data is handled as well as non-negative data.
    #define FLOAT_MIN (-3.402823466e+38f)
    
    #include <time.h>
    #include <sys/time.h>
    
    // Host-side wall-clock helper with microsecond resolution.
    // Returns the current time of day in microseconds minus `prev`:
    // pass 0 to obtain a start stamp, then pass that stamp back in to
    // obtain the elapsed time since it was taken.
    unsigned long long dtime_usec(unsigned long long prev){
    #define USECPSEC 1000000ULL
      struct timeval now;
      gettimeofday(&now, NULL);
      // seconds are scaled to microseconds before adding the sub-second part
      const unsigned long long stamp = (now.tv_sec * USECPSEC) + now.tv_usec;
      return stamp - prev;
    }
    
    // Global-memory scratch space shared by all blocks of max_idx_kernel.
    // Each block stores its partial maximum value in blk_vals[blockIdx.x] ...
    __device__ volatile float blk_vals[MAX_BLOCKS];
    // ... and the data index of that partial maximum in blk_idxs[blockIdx.x].
    // Both arrays are volatile so that reads by another block are not served from a cached copy.
    __device__ volatile int   blk_idxs[MAX_BLOCKS];
    // Counter incremented atomically by each block after it has stored its partial
    // result; the block that observes gridDim.x - 1 performs the final reduction.
    __device__ int   blk_num = 0;
    
    // max_idx_kernel: writes to *result the index of the largest element of
    // data[0..dsize), using a single launch.
    //
    // Algorithm:
    //   1. every thread scans a strided subset of the input and keeps its own best
    //      (value, index) pair;
    //   2. each block reduces its threads' pairs in shared memory;
    //   3. thread 0 of each block publishes the block's pair to blk_vals/blk_idxs
    //      and bumps blk_num; the block that arrives last reduces all published
    //      pairs and stores the winning index.
    //
    // Preconditions:
    //   - blockDim.x == nTPB, and nTPB is a power of 2 (the tree reduction halves
    //     the active range each step);
    //   - gridDim.x <= MAX_BLOCKS (capacity of blk_vals / blk_idxs);
    //   - launches must not overlap in time (e.g. issue them on one stream),
    //     because blk_vals, blk_idxs and blk_num are shared global state.
    //
    // Only elements strictly greater than FLOAT_MIN can be selected; if no element
    // qualifies (including dsize <= 0), -1 is written to *result.
    //
    // Parameters:
    //   data   - device pointer to the input values
    //   dsize  - number of elements in data
    //   result - device pointer receiving the index of the maximum
    template <typename T>
    __global__ void max_idx_kernel(const T *data, const int dsize, int *result){

      __shared__ volatile T   vals[nTPB];
      __shared__ volatile int idxs[nTPB];
      __shared__ volatile int last_block;
      int idx = threadIdx.x+blockDim.x*blockIdx.x;
      // a single writer is enough; the barrier below makes the value visible
      if (!threadIdx.x) last_block = 0;
      T   my_val = FLOAT_MIN;
      int my_idx = -1;
      // sweep from global memory
      while (idx < dsize){
        const T v = data[idx];   // read once instead of twice
        if (v > my_val) {my_val = v; my_idx = idx;}
        idx += blockDim.x*gridDim.x;}
      // populate shared memory
      vals[threadIdx.x] = my_val;
      idxs[threadIdx.x] = my_idx;
      __syncthreads();
      // sweep in shared memory
      for (int i = (nTPB>>1); i > 0; i>>=1){
        if (threadIdx.x < i)
          if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
        __syncthreads();}
      // perform block-level reduction
      if (!threadIdx.x){
        blk_vals[blockIdx.x] = vals[0];
        blk_idxs[blockIdx.x] = idxs[0];
        // Make the two stores above visible device-wide BEFORE this block is
        // counted as done; otherwise the last block could observe the counter
        // update but still read stale partial results.
        __threadfence();
        if (atomicAdd(&blk_num, 1) == gridDim.x - 1) // then I am the last block
          last_block = 1;}
      __syncthreads();
      // last_block lives in shared memory and is read after a barrier, so the
      // branch is uniform across the block and the barriers inside it are safe.
      if (last_block){
        idx = threadIdx.x;
        my_val = FLOAT_MIN;
        my_idx = -1;
        while (idx < gridDim.x){
          if (blk_vals[idx] > my_val) {my_val = blk_vals[idx]; my_idx = blk_idxs[idx]; }
          idx += blockDim.x;}
      // populate shared memory
        vals[threadIdx.x] = my_val;
        idxs[threadIdx.x] = my_idx;
        __syncthreads();
      // sweep in shared memory
        for (int i = (nTPB>>1); i > 0; i>>=1){
          if (threadIdx.x < i)
            if (vals[threadIdx.x] < vals[threadIdx.x + i]) {vals[threadIdx.x] = vals[threadIdx.x+i]; idxs[threadIdx.x] = idxs[threadIdx.x+i]; }
          __syncthreads();}
        if (!threadIdx.x){
          *result = idxs[0];
          // Re-arm the arrival counter so the kernel can be launched again.
          // Every block of this launch has already incremented it, so no
          // increment from this launch can race with the reset.
          blk_num = 0;}
        }
    }
    
    // Abort with a diagnostic if a CUDA runtime call did not succeed.
    static void check_cuda(cudaError_t err, const char *what){
      if (err != cudaSuccess){
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        exit(1);}
    }

    // Abort with a diagnostic if a cuBLAS call did not succeed.
    static void check_cublas(cublasStatus_t status, const char *what){
      if (status != CUBLAS_STATUS_SUCCESS){
        std::cerr << what << " failed with cuBLAS status " << (int)status << std::endl;
        exit(1);}
    }

    // Benchmark driver: fills a vector of DSIZE random floats in [0,1], plants a
    // known maximum at index 10, and times three ways of locating it on the GPU:
    // thrust::max_element, cublasIsamax, and the custom max_idx_kernel.
    // Timings are host wall-clock seconds measured with dtime_usec.
    int main(){

      int nrElements = DSIZE;
      float *d_vector, *h_vector;
      h_vector = new float[DSIZE];
      for (int i = 0; i < DSIZE; i++) h_vector[i] = rand()/(float)RAND_MAX;
      h_vector[10] = 10;  // create definite max element
      cublasHandle_t my_handle;
      check_cublas(cublasCreate(&my_handle), "cublasCreate");
      check_cuda(cudaMalloc(&d_vector, DSIZE*sizeof(float)), "cudaMalloc(d_vector)");
      check_cuda(cudaMemcpy(d_vector, h_vector, DSIZE*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(H2D)");

      // --- method 1: thrust::max_element ---
      int max_index = 0;
      unsigned long long dtime = dtime_usec(0);
      //d_vector is a pointer on the device pointing to the beginning of the vector, containing nrElements floats.
      thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(d_vector);
      // max_element on device_ptr iterators returns a device_ptr; no cast needed
      thrust::device_ptr<float> d_it = thrust::max_element(d_ptr, d_ptr + nrElements);
      max_index = (int)(d_it - d_ptr);
      check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(thrust)");
      dtime = dtime_usec(dtime);
      std::cout << "thrust time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;

      // --- method 2: cublasIsamax (index of max |x|, reported 1-based) ---
      max_index = 0;
      dtime = dtime_usec(0);
      cublasStatus_t my_status = cublasIsamax(my_handle, DSIZE, d_vector, 1, &max_index);
      check_cuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize(cublas)");
      dtime = dtime_usec(dtime);
      check_cublas(my_status, "cublasIsamax");
      std::cout << "cublas time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;

      // --- method 3: custom kernel ---
      max_index = 0;
      int *d_max_index;
      check_cuda(cudaMalloc(&d_max_index, sizeof(int)), "cudaMalloc(d_max_index)");
      const int blocks = MIN(MAX_KERNEL_BLOCKS, ((DSIZE+nTPB-1)/nTPB));
      dtime = dtime_usec(0);
      max_idx_kernel<<<blocks, nTPB>>>(d_vector, DSIZE, d_max_index);
      // the blocking copy also waits for the kernel, so it closes the timed region
      cudaError_t copy_err = cudaMemcpy(&max_index, d_max_index, sizeof(int), cudaMemcpyDeviceToHost);
      dtime = dtime_usec(dtime);
      check_cuda(cudaGetLastError(), "max_idx_kernel launch");
      check_cuda(copy_err, "cudaMemcpy(D2H)");
      std::cout << "kernel time: " << dtime/(float)USECPSEC << " max index: " << max_index << std::endl;

      // release everything that was acquired above
      check_cuda(cudaFree(d_max_index), "cudaFree(d_max_index)");
      check_cuda(cudaFree(d_vector), "cudaFree(d_vector)");
      check_cublas(cublasDestroy(my_handle), "cublasDestroy");
      delete[] h_vector;

      return 0;
    }
    $ nvcc -O3 -arch=sm_20 -o t665 t665.cu -lcublas
    $ ./t665
    thrust time: 0.00075 max index: 10
    cublas time: 6.3e-05 max index: 11
    kernel time: 2.5e-05 max index: 10
    $
    

    注意:

    1. CUBLAS 返回的索引比其他方法的结果大 1,因为 CUBLAS 使用从 1 开始的索引(1-based indexing)。
    2. 如果使用 CUBLAS_POINTER_MODE_DEVICE,CUBLAS 可能会更快;不过为了验证结果,您仍然需要把结果复制回主机。
    3. 使用 CUBLAS_POINTER_MODE_DEVICE 时,CUBLAS 应当是异步的,因此对于我在这里展示的基于主机的计时,最好调用 cudaDeviceSynchronize()。在某些情况下,thrust 也可能是异步的。
    4. 为了方便和CUBLAS与其他方法之间的结果比较,我使用了所有非负值来表示我的数据。如果您也使用负值,则可能需要调整FLOAT_MIN值。
    5. 如果您非常在意性能,可以尝试调整 nTPB 和 MAX_KERNEL_BLOCKS 参数,看看能否在您的特定 GPU 上获得最佳性能。另外,内核代码在(两次)线程块归约的最后阶段没有谨慎地切换到 warp 同步(warp-synchronous)模式,因此可以说还留有一些性能提升空间。
    6. 线程块归约内核采用了 block-draining(最后一个块,last-block)策略,从而避免了为执行最终归约而额外启动一次内核的开销。