Question

假设我有两个 device_vector<byte> 数组：d_keys 和 d_data。

例如，如果 d_data 是一个展平的 3x5 二维数组（例如 {1,2,3,4,5,6,7,8,9,8,7,6,5,4,3}），d_keys 是一个大小为 5 的一维数组（例如 {1,0,0,1,1}），如何按行进行归约，使得只有对应的 d_keys 值为 1 的元素才被累加（例如，结果为 {10,23,14}）？

sum_rows.cu 示例可以把 d_data 中的所有值相加，但这并不完全是我想要的。

或者，我可以逐行使用 zip_iterator，每次把 d_keys 与 d_data 的一行组合起来，再执行 transform_reduce，仅在键值为 1 时才累加，但这样我就必须循环遍历 d_data 数组的每一行。

我真正需要的是某种 transform_reduce_by_key 功能，虽然它不是内置的，但肯定有办法实现它！

Answer 1

根据补充评论，实际数据不是 3 行而是有数千行，因此我们可以编写一个对整行求和的转换仿函数。既然有数千行，这应该足以让机器保持繁忙：

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/sequence.h>
#include <thrust/fill.h>

// Dimensions of the flattened value array used by main(): ROW rows of COL
// values each (main() allocates ROW*COL values and COL keys).
#define ROW   20
#define COL   10

// Device-side global pointers read by test_functor. main() fills them in with
// cudaMemcpyToSymbol so that they point at the device_vector storage.
__device__ int *vals;
__device__ int *keys;

// Functor that computes the key-weighted sum of one row of `vals`.
//
// `a` is the row length (number of columns). For a row index `y` the call
// operator returns
//     sum over i in [0, a) of vals[y*a + i] * keys[i]
// where `vals` and `keys` are the file-scope __device__ pointers. They must
// already point at valid device storage when the functor is invoked.
struct test_functor
{
  const int a;  // number of values per row

  test_functor(int _a) : a(_a) {}

  // Binary form so the functor can be used with the two-input overload of
  // thrust::transform. The first argument is ignored; the second is the row
  // index. Nothing is modified, so the arguments are taken by const reference
  // and the operator itself is const.
  __device__
  int operator()(const int& /*x*/, const int& y) const {
    const int row_start = y * a;  // offset of the first element of row y
    int temp = 0;
    for (int i = 0; i < a; i++)
      temp += vals[row_start + i] * keys[i];
    return temp;
  }
};

// Demo driver for test_functor.
//
// Builds a ROW x COL array holding 0, 1, 2, ... and a key array of COL ones
// with keys[0] cleared, computes one key-weighted sum per row on the device,
// and prints the sums. Returns 0 on success, 1 if publishing the device
// pointers fails.
int main(){
  // Host-side inputs.
  thrust::host_vector<int> h_vals(ROW*COL);
  thrust::host_vector<int> h_keys(COL);
  thrust::sequence(h_vals.begin(), h_vals.end());
  thrust::fill(h_keys.begin(), h_keys.end(), 1);
  h_keys[0] = 0;

  // Device copies, plus one zero-initialised output slot per row.
  thrust::device_vector<int> d_vals = h_vals;
  thrust::device_vector<int> d_keys = h_keys;
  thrust::device_vector<int> d_sums(ROW, 0);

  // Publish the raw device pointers through the __device__ globals that
  // test_functor reads. If either copy fails the functor would dereference
  // an unset pointer, so stop here instead of launching the transform.
  int *s_vals = thrust::raw_pointer_cast(&d_vals[0]);
  int *s_keys = thrust::raw_pointer_cast(&d_keys[0]);
  cudaError_t err = cudaMemcpyToSymbol(vals, &s_vals, sizeof(int *));
  if (err == cudaSuccess)
    err = cudaMemcpyToSymbol(keys, &s_keys, sizeof(int *));
  if (err != cudaSuccess) {
    std::cerr << "cudaMemcpyToSymbol failed: " << cudaGetErrorString(err) << std::endl;
    return 1;
  }

  // d_idx holds the row indices 0..ROW-1. d_sums is passed as the first input
  // only to satisfy the binary transform signature; the functor ignores it.
  thrust::device_vector<int> d_idx(ROW);
  thrust::sequence(d_idx.begin(), d_idx.end());
  thrust::transform(d_sums.begin(), d_sums.end(), d_idx.begin(),  d_sums.begin(), test_functor(COL));

  // Copy the results back to the host and print them.
  thrust::host_vector<int> h_sums = d_sums;
  std::cout << "Results :" << std::endl;
  for (unsigned i = 0; i<ROW; i++)
    std::cout<<"h_sums["<<i<<"] = " << h_sums[i] << std::endl;
  return 0;
}

这种方法的缺点是，对 vals 数组的访问通常无法合并（coalesced）。不过，对于几千行的数据，缓存可以显著缓解这个问题。我们也可以把展平数组中的数据重新排列为按列主序（column-major）存储，并相应修改仿函数循环中的索引方式来解决这个问题，如下所示：

// Column-major indexing: element (row y, column col) is at col*ROW + y.
for (int col = 0; col < a; ++col) {
  temp += keys[col] * vals[col * ROW + y];
}

如果愿意，您可以将ROW作为附加参数传递给仿函数。

Answer 2

以下是一些示例代码，它使用我在您的问题下面的评论中概述的方法来完成您想要的操作。实际上我们需要使用 4 元组，以便把键值也包含进来。在此复制经过适当修改的评论：

你可以创建一个 zip_iterator，把你的 3 行数据和键所在的那一“行”组合在一起，然后把得到的 4 元组传递给一个特殊的仿函数。这个仿函数会结合键对 3 行数据进行归约，并返回一个 4 元组结果。Thrust 的 dot product 示例可能会给你一些思路。

这是一种可行的方法：

#include <iostream>

#include <thrust/fill.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sequence.h>
#include <thrust/transform_reduce.h>
#include <thrust/tuple.h>

#define N 30  // make this evenly divisible by 3 for this example

// One column of the problem: three int values plus a fourth int that
// the functors below use as the key (multiplier).
typedef thrust::tuple<int, int, int, int>  tpl4int;
// Iterator over a host_vector<int>.
typedef thrust::host_vector<int>::iterator intiter;
// Four such iterators bundled together, and the zip_iterator built from them;
// dereferencing the zip_iterator yields one 4-tuple per position.
typedef thrust::tuple<intiter, intiter, intiter, intiter>  tpl4intiter;
typedef thrust::zip_iterator<tpl4intiter>  int4zip;



// Transform step: applies the key of one column to its three row values.
//
// Input:  (v0, v1, v2, key)
// Output: (v0*key, v1*key, v2*key, 1)
// The key slot of the result is set to 1 so that the scaled values pass
// through a later multiplication by the key unchanged.
struct r3key_unary_op : public thrust::unary_function<tpl4int, tpl4int>
{
  __host__ __device__
  tpl4int operator()(const tpl4int& x) const
  {
    // thrust::get<N>() is used for both reading and writing tuple slots.
    const int key = thrust::get<3>(x);
    tpl4int result;
    thrust::get<0>(result) = thrust::get<0>(x) * key;
    thrust::get<1>(result) = thrust::get<1>(x) * key;
    thrust::get<2>(result) = thrust::get<2>(x) * key;
    thrust::get<3>(result) = 1;
    return result;
  }
};

// Reduction step: combines two 4-tuples into one.
//
// Each operand's three values are first multiplied by that operand's own key
// (slot 3) and then added slot by slot. The key slot of the result is set to 1.
struct r3key_binary_op : public thrust::binary_function<tpl4int, tpl4int, tpl4int>
{
  __host__ __device__
  tpl4int operator()(const tpl4int& x, const tpl4int& y) const
  {
    // thrust::get<N>() is used for both reading and writing tuple slots.
    const int x_key = thrust::get<3>(x);
    const int y_key = thrust::get<3>(y);
    tpl4int result;
    thrust::get<0>(result) = thrust::get<0>(x) * x_key + thrust::get<0>(y) * y_key;
    thrust::get<1>(result) = thrust::get<1>(x) * x_key + thrust::get<1>(y) * y_key;
    thrust::get<2>(result) = thrust::get<2>(x) * x_key + thrust::get<2>(y) * y_key;
    thrust::get<3>(result) = 1;
    return result;
  }
};


// Demo driver: sums three flattened rows of A in a single transform_reduce,
// counting only the columns whose key in K is non-zero, and prints the three
// row sums.
int main() {

  thrust::host_vector<int> A(N);  // values, in 3 "rows" flattened
  thrust::sequence(A.begin(), A.end());
  thrust::host_vector<int> K(N/3);   // keys in one row
  thrust::fill(K.begin(), K.end(), 1);  // set some keys to 1
  K[N/3 - 1] = 0;  // set some keys to zero (the last column; index 9 when N == 30)

  // Zip the three rows with the key row: position j yields
  // (A[j], A[N/3 + j], A[2*N/3 + j], K[j]).
  int4zip first = thrust::make_zip_iterator(thrust::make_tuple(A.begin(), A.begin() + N/3, A.begin() + 2*N/3, K.begin()));
  int4zip  last = thrust::make_zip_iterator(thrust::make_tuple(A.begin() + N/3, A.begin() + 2*N/3, A.end(), K.end()));
  r3key_unary_op my_unary_op;
  r3key_binary_op my_binary_op;

  // The init value is combined with the transformed value of EVERY column in
  // [first, last), so it must be the zero tuple to obtain plain row sums.
  // Seeding it with my_unary_op(*first) would count column 0 twice.
  tpl4int init = thrust::make_tuple((int) 0, (int) 0, (int) 0, (int) 0);
  tpl4int result = thrust::transform_reduce(first, last, my_unary_op, init, my_binary_op);

  // With N == 30 this prints 36, 126 and 216.
  std::cout << "row 0 = " << thrust::get<0>(result) << std::endl;
  std::cout << "row 1 = " << thrust::get<1>(result) << std::endl;
  std::cout << "row 2 = " << thrust::get<2>(result) << std::endl;
  return 0;

}

注意：

这里只使用了 host_vector。将其扩展为使用 device_vector，或者将其模板化以支持 int 以外的类型，应该都很简单。
为了完整性说明：init 可以像 my_unary_op(*first) 那样用一元仿函数提供非零的初始值，但该初始值会与每一列的结果一起累加（也就是说第一列会被计入两次）；如果只需要各行的和，应将 init 设为零（全零的 4 元组）。

CUDA Thrust：基于“key”数组中的值，仅对数组中的某些值进行reduce_by_key

2 个答案: