Question

我正在寻找CUDA上的排序算法，它可以对元素数组A（double）进行排序，并返回该数组A的键B数组。我知道Thrust库中的sort_by_key函数，但我希望我的元素数组A保持不变。我该怎么办？

我的代码是：

void sortCUDA(double V[], int P[], int N) {

        real_t *Vcpy = (double*) malloc(N*sizeof(double));
        memcpy(Vcpy,V,N*sizeof(double));

        thrust::sort_by_key(V, V + N, P);
        free(Vcpy);
}

我正在将推力算法与我在顺序cpu上的其他算法进行比较

N               mergesort       sortCUDA
113             0.000008        0.000010
226             0.000018        0.000016
452             0.000036        0.000020
905             0.000061        0.000034
1810            0.000135        0.000071
3621            0.000297        0.000156
7242            0.000917        0.000338
14484           0.001421        0.000853
28968           0.003069        0.001931
57937           0.006666        0.003939
115874          0.014435        0.008025
231749          0.031059        0.016718
463499          0.067407        0.039848
926999          0.148170        0.118003
1853998         0.329005        0.260837
3707996         0.731768        0.544357
7415992         1.638445        1.073755
14831984        3.668039        2.150179
115035495       39.276560       19.812200
230070990       87.750377       39.762915
460141980       200.940501      74.605219

推力表现还不错，但我认为如果我使用OMP可能很容易获得更好的CPU时间

我认为这是因为memcpy

解决方案：

void thrustSort(double V[], int P[], int N)
{
        thrust::device_vector<int> d_P(N);
        thrust::device_vector<double> d_V(V, V + N);
        thrust::sequence(d_P.begin(), d_P.end());

        thrust::sort_by_key(d_V.begin(), d_V.end(), d_P.begin());

        thrust::copy(d_P.begin(),d_P.end(),P);
}

其中V是我的排序值

Answer 1

您可以修改比较运算符以对键进行排序而不是值。 @Robert Crovella正确地指出无法从主机分配原始设备指针。修改后的算法如下：

struct cmp : public binary_function<int,int,bool>
{
  cmp(const double *ptr) : rawA(ptr) { }

  __host__ __device__ bool operator()(const int i, const int j) const 
  {return rawA[i] > rawA[j];}

   const double *rawA; // an array in global mem
}; 

void sortkeys(double *A, int n) {
  // move data to the gpu
  thrust::device_vector<double> devA(A, A + n);
  double *rawA = thrust::raw_pointer_cast(devA.data());

  thrust::device_vector<int> B(n);
  // initialize keys
  thrust::sequence(B.begin(), B.end());
  thrust::sort(B.begin(), B.end(), cmp(rawA));
  // B now contains the sorted keys
 }

这是arrayfire的替代方案。虽然我不确定哪一个更有效，因为arrayfire解决方案使用了两个额外的数组：

void sortkeys(double *A, int n) {
   af::array devA(n, A, af::afHost);
   af::array vals, indices;
   // sort and populate vals/indices arrays
   af::sort(vals, indices, devA);
   std::cout << devA << "\n" << indices << "\n";
}

Answer 2

这个阵列有多大？就速度而言，最有效的方法可能是在排序之前复制原始数组，如果内存可用的话。

Answer 3

基于@asm提供的答案（我无法使其工作），这段代码似乎对我有用，并且只对键进行排序。但是，我认为它仅限于键是按顺序0,1,2,3,4 ...对应于（双）值的情况。由于这是一种“索引值”排序，因此可以通过执行索引副本将其扩展到任意键序列的情况。但是，我不确定生成索引序列然后重新排列原始键的过程将比将原始值数据复制到新向量（对于任意键的情况）更快。

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>

using namespace std;

__device__  double *rawA; // an array in global mem

struct cmp : public binary_function<int, int, bool>
{
  __host__ __device__  bool operator()(const int i, const int j) const
  {return ( rawA[i] < rawA[j]);}
};

void sortkeys(double *A, int n) {
  // move data to the gpu
  thrust::device_vector<double> devA(A, A + n);
//  rawA = thrust::raw_pointer_cast(&(devA[0]));
  double *test = raw_pointer_cast(devA.data());
  cudaMemcpyToSymbol(rawA, &test, sizeof(double *));

  thrust::device_vector<int> B(n);
  // initialize keys
  thrust::sequence(B.begin(), B.end());
  thrust::sort(B.begin(), B.end(), cmp());
  // B now contains the sorted keys
  thrust::host_vector<int> hostB = B;
  for (int i=0; i<hostB.size(); i++)
    std::cout << hostB[i] << " ";
  std::cout<<std::endl;
  for (int i=0; i<hostB.size(); i++)
    std::cout << A[hostB[i]] << " ";
  std::cout<<std::endl;
 }


int main(){

  double C[] = {0.7, 0.3, 0.4, 0.2, 0.6, 1.2, -0.5, 0.5, 0.0, 10.0};
  sortkeys(C, 9);
  std::cout << std::endl;
  return 0;
}

CUDA Thrust和sort_by_key

3 个答案: