Question

我想弄清楚如何在CUDA Thrust中使用索引数组。我的问题如下：

// Gather pairs of elements from the two data arrays through `index` and
// combine each selected pair with do_something().
vector<int> index(20);
vector<float> data1(100), data2(100), result(20);
// result already holds 20 elements, so store each value in place; push_back
// here would append 20 more entries after the 20 value-initialized ones.
for(size_t i=0;i<index.size();++i)
   result[i] = do_something(data1[index[i]],data2[index[i]]);

函数do_something()的输入是由索引数组从几个大数组中选出的元素。index的大小通常远小于data的大小，并且index中的元素是已排序的。

我不清楚用Thrust实现这一点的最佳策略是什么。

Answer 1

我建议阅读Thrust快速入门指南（quick start guide），以了解Thrust的基础知识以及我将在下面用到的一些概念。

> 我不清楚用Thrust实现这一点的最佳策略是什么。

您示例中的大部分内容都可以直接映射为Thrust中的简单操作。您的for循环和do_something操作可以用一个Thrust算法（algorithm）来代替，并通过定义适当的仿函数（functor）来实现do_something的功能。thrust::transform()就是一个易于使用、适合这种情况的Thrust算法。

用于间接寻址的index数组通常用Thrust的permutation_iterator来处理，它正是为此目的而设计的。

结合这些概念，并假设一个简单的例子：您的do_something操作对两个输入向量中每个被索引元素的平方求和，然后将该和的平方根存储在result向量中，我们可以写出如下示例：

$ cat t74.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/copy.h>
#include <math.h>
#include <iostream>
#include <iterator>

// Binary functor: maps a pair of values (i1, i2) to sqrt(i1*i1 + i2*i2),
// i.e. the hypotenuse length for legs i1 and i2.
struct do_something
{
  // Usable from both host and device code. The functor holds no state, so
  // the call operator is const; this also allows it to be invoked through a
  // const object or a const reference.
  // Note: the root is taken with sqrtf(), so the computation is carried out
  // in single precision and the float result is converted back to T.
  template <typename T>
  __host__ __device__ T operator()(const T &i1, const T &i2) const {
    return sqrtf(i1*i1 + i2*i2);
  }
};


int main(){
  //pythagorean triples
  float d1[] = { 3,  5,  8,  7, 20, 12,  9, 28};
  float d2[] = { 4, 12, 15, 24, 21, 35, 40, 45};
  int i1[] = {1, 3, 5, 7};
  const size_t isize = sizeof(i1)/sizeof(i1[0]);
  const size_t dsize = sizeof(d1)/sizeof(d1[0]);
  thrust::device_vector<int> index(i1, i1+isize);
  thrust::device_vector<float> data1(d1, d1+dsize);
  thrust::device_vector<float> data2(d2, d2+dsize);
  thrust::device_vector<float> result(isize);
  thrust::transform(thrust::make_permutation_iterator(data1.begin(), index.begin()), thrust::make_permutation_iterator(data1.begin(), index.end()), thrust::make_permutation_iterator(data2.begin(), index.begin()), result.begin(), do_something());
  thrust::copy_n(result.begin(), result.size(), std::ostream_iterator<float>(std::cout, ","));
  std::cout << std::endl;
}
$ nvcc -arch=sm_30 -o t74 t74.cu
$ ./t74
13,25,37,53,
$

不需要为此用法对索引集进行排序（尽管从性能角度来看可能会有所帮助）。

CUDA Thrust中的索引数组

1 个答案: