Question

我正在寻找一种类似于快速选择（quickselect）的算法，但它不会修改其输入。我希望数据保持只读，这样在CUDA GPU上执行时会更节省内存。

Answer 1

根据你所说的“这样在CUDA GPU上执行会更节省内存”，我推测你是想避免在GPU上分配临时内存，而大多数合理的排序和/或选择算法都会进行这样的分配。

首先我想说，我认为这很可能是一个坏主意：尽管这样做也许满足了你对内存效率的要求，但几乎肯定会严重削弱任何算法的性能。为排序或选择算法提供临时空间以记录其已完成的工作，通常能得到更快的代码，代价就是上述的临时内存分配。

有了以上前提，这里给出一种可能的方法：一种暴力（brute-force）算法，它对只读数组中的元素逐个计算排名，直到找到排名等于所选位置的元素为止。基本思想非常简单：对于只读输入数组中的每个元素，统计整个数组中排在它前面的元素个数（对于降序排序，即比它大的元素个数）。重复值需要特殊处理，我已经在代码中包含了这一点。一旦找到所选元素，就可以终止搜索。

这是一个完整的例子，结果验证：

$ cat t729.cu
#include <stdio.h>
#include <stdlib.h>

// for validation
#include <algorithm>

// Number of int elements in the host and device data arrays.
#define DSIZE 10000
// Modulus applied to rand() when filling the input, so values lie in [0, RG).
#define RG    1000
// Rank (0-based) passed to the kernel as the `select` argument.
#define SELECT_ITEM 100

// Launch configuration used for my_select: number of blocks ...
#define nBLK  64
// ... and threads per block.
#define nTPB  256

// Aborts the program if the CUDA runtime reports a pending error.
// Fetches the error state with cudaGetLastError(); on anything other than
// cudaSuccess it prints `msg`, the runtime's error string and the file/line
// of the macro use to stderr, then calls exit(1).
// The do { ... } while (0) wrapper makes the macro usable as one statement.
// The local variable avoids a double underscore in its name because such
// identifiers are reserved for the implementation in C++.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)



// Values for the `descending` template argument of my_select:
// DESCEND ranks larger values first, ASCEND ranks smaller values first.
#define DESCEND true
#define ASCEND  false



// Brute-force selection on a read-only array, using no temporary storage.
//
// Each thread walks the input with a grid-stride loop. For its element it
// counts how many elements of the whole array rank ahead of it: strictly
// greater values when `descending` is true, strictly smaller otherwise.
// Equal values are ordered by their position in the array, so every element
// gets a distinct rank and at most one thread can match `select`.
//
// Template parameters:
//   descending - true to rank from largest to smallest, false for the reverse
//   T          - element type; must support <, > and ==
// Parameters:
//   data   - input array of `length` elements, never written
//   length - number of elements in `data`
//   select - 0-based rank to look for
//   index  - single int the caller must set to -1 before the launch; receives
//            the position in `data` of the element with rank `select`, and is
//            left at -1 when no element has that rank (e.g. select >= length)
// `index` is volatile so every thread re-reads it and can stop picking up new
// elements once some thread has published a result.
template <bool descending, typename T>
__global__ void my_select(const T *data, const unsigned length, const unsigned select, volatile int *index){

  // Keep all index arithmetic unsigned to match `length` and the loop counter.
  const unsigned stride = blockDim.x*gridDim.x;
  unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;
  while ((*index == -1) && (idx < length)){
    const T my_val = data[idx];
    unsigned my_rank = 0;
    // count the number of values that rank ahead of me
    for (unsigned i = 0; i < length; i++){
      const T temp = data[i];
      const bool ahead = descending ? (temp > my_val) : (temp < my_val);
      // equal values that appear earlier in the array also rank ahead (duplicates)
      if (ahead || ((temp == my_val) && (i < idx))){
        // the rank only grows, so once it passes `select` this element is ruled out
        if (++my_rank > select) break;
      }
    }
    if (my_rank == select) *index = (int)idx;
    idx += stride;
  }
}

// Demo driver: fills DSIZE ints with rand()%RG, asks my_select for the element
// at rank SELECT_ITEM in descending order, prints it, and cross-checks the
// value against a host-side std::sort of the same data.
// Returns 0 on success, -1 if host allocation fails or no element was selected.
int main(){

  int *h_data, *d_data, *d_result, h_result = -1;
  int status = 0;
  h_data = (int *)malloc(DSIZE*sizeof(int));
  if (h_data == NULL) {printf("malloc fail\n"); return -1;}
  cudaMalloc(&d_data, DSIZE*sizeof(int));
  // the kernel reports a single index, so one int of device storage is enough
  cudaMalloc(&d_result, sizeof(int));
  cudaCheckErrors("cudaMalloc fail");
  for (int i =0; i < DSIZE; i++) h_data[i] = rand()%RG;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(int), cudaMemcpyHostToDevice);
  // seed the result slot with -1, the "nothing found yet" value the kernel polls
  cudaMemcpy(d_result, &h_result, sizeof(int),   cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy fail");
  my_select<DESCEND><<<nBLK, nTPB>>>(d_data, DSIZE, SELECT_ITEM, d_result);
  cudaCheckErrors("kernel launch fail");
  cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy/kernel fail");
  if ((h_result < 0) || (h_result >= DSIZE)){
    // result still -1 (or out of range): do not index h_data with it
    printf("no item found at select position %d\n", SELECT_ITEM);
    status = -1;
  }
  else {
    printf("the item index at select position %d is %d, item value = %d\n", SELECT_ITEM, h_result, h_data[h_result]);
    // validation
    std::sort(h_data, h_data+DSIZE);
    printf("the item value at that sorted position is                %d\n", h_data[(DSIZE-1) - SELECT_ITEM]); // assumes DESCEND
  }

  // release device and host buffers
  cudaFree(d_data);
  cudaFree(d_result);
  free(h_data);
  return status;
}
$ nvcc -o t729 t729.cu
$ ./t729
the item index at select position 100 is 2858, item value = 990
the item value at that sorted position is                990
$

在我看来，更好的方法是复制一份输入数据，把该副本交给你选用的、针对GPU优化的选择算法（例如原回答中链接的那一个，“this one”），并让该算法使用它所需要的任何临时空间。这几乎肯定会比上述方法更快，也很可能比你能想到的任何不使用临时空间的方法都快。

快速选择算法处理只读输入以在CUDA上运行

1 个答案: