Question

我在一个仿函数（functor）中调用了 thrust::reduce，而这个仿函数又作为参数传给了 thrust::transform_reduce，相当于嵌套调用 Thrust 算法。代码可以编译通过，但运行时报错：

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  cudaEventSynchronize in future::wait: an illegal memory access was encountered
Aborted (core dumped)

代码如下：

#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>

#include <iostream>
#include <cmath>
#include <boost/concept_check.hpp>


// Unary functor (int -> int) passed to thrust::transform_reduce in main().
// Each call ignores its argument and returns the reduction (sum) of the N_
// ints starting at av_.
struct aFuntor : public thrust::unary_function<int, int>
{
    // Stores the two raw pointers and the element count; bv_ is kept but is
    // not read anywhere in this struct.
    aFuntor(int* av__, int* bv__, const int& N__) : av_(av__), bv_(bv__), N_(N__) {};

    // idx is not used by the body.
    __host__ __device__
    int operator()(const int& idx)
    {

    // Wrap the raw pointer so Thrust treats it as a device pointer.
    thrust::device_ptr<int> av_dpt = thrust::device_pointer_cast(av_);

    // NOTE(review): no execution policy is passed here. nvcc reports this
    // call as a __host__ function invoked from a __host__ __device__
    // function, which matches the runtime failure described in the question.
    int res = thrust::reduce(av_dpt, av_dpt+N_);

        return res;
    }

    int* av_;  // start of the range that operator() reduces
    int* bv_;  // stored only; unused in this struct
    int N_;    // number of elements reduced starting at av_
};


// Reproducer: copies two host vectors to the device and runs
// thrust::transform_reduce over a counting range with aFuntor as the transform.
int main(void)
{
      // NOTE(review): N is 5 but av and bv each hold 4 values. Assigning them
      // to the device vectors below presumably leaves those with 4 elements,
      // so reducing N elements inside aFuntor would read past the end - confirm.
      int N = 5;
      std::vector<int> av = {0,1,3,5};
      std::vector<int> bv = {0,10,20,30};
      thrust::device_vector<int> av_d(N);
      thrust::device_vector<int> bv_d(N);
      av_d = av; bv_d = bv;

      // initial value of the reduction
      int init=0;

      // binary operations
      thrust::plus<int>        bin_op;

      // The counting range [0, N-1) yields N-1 functor invocations; the
      // functor receives raw device pointers and N.
      int res =
      thrust::transform_reduce(thrust::counting_iterator<int>(0),
                               thrust::counting_iterator<int>(N-1),
                   aFuntor(thrust::raw_pointer_cast(av_d.data()), 
                      thrust::raw_pointer_cast(bv_d.data()),
                      N),
                init,
                bin_op);    

      std::cout << "result is: " << res << std::endl;
      return 0;
}

Thrust 是否支持这种嵌套调用？还是说除了重新设计算法之外别无他法？据我所知，有些算法本身很难暴露出并行性。

提前谢谢！

Answer 1

Thrust 允许嵌套使用算法（nested algorithm usage）。但是，从设备代码中启动 Thrust 算法时，必须确保 Thrust 只选择设备路径，而在您的代码中并非如此。至少在我的系统（Ubuntu 14.04）上，按原样编译您的代码时，我得到了下面这条警告，说明了这一点：

t113.cu(20) (col. 9): warning: calling a __host__ function("thrust::reduce< ::thrust::device_ptr<int> > ") from a __host__ __device__ function("aFuntor::operator ()") is not allowed

这显然不是我们想要的。我们可以通过 Thrust 的执行策略 thrust::device 强制 Thrust 走设备路径（即在设备代码中执行——您的仿函数定义其实已经隐含了这一点，因为您传入的是设备指针）。按如下方式修改后，您的代码可以编译并正常运行，不再报错：

$ cat t113.cu
#include <thrust/inner_product.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>

#include <iostream>
#include <cmath>
#include <thrust/execution_policy.h>
//#include <boost/concept_check.hpp>


// Unary functor (int -> int) for use as the transform in
// thrust::transform_reduce. Each call ignores its argument and returns the
// sum of the N_ ints starting at the device pointer av_.
struct aFuntor : public thrust::unary_function<int, int>
{
    // av, bv: raw device pointers; n: number of ints to reduce starting at av.
    // bv is stored for the caller's benefit but is not read by operator().
    // (Parameter names avoid double underscores, which are reserved in C++.)
    aFuntor(int* av, int* bv, const int& n) : av_(av), bv_(bv), N_(n) {}

    // Const-qualified so the functor can be invoked through const copies.
    // The index argument is intentionally unused.
    __host__ __device__
    int operator()(const int& /*idx*/) const
    {
        thrust::device_ptr<int> av_dpt = thrust::device_pointer_cast(av_);

        // The explicit thrust::device policy forces the device path; without
        // it nvcc resolves the call to a __host__ overload, which is not
        // allowed from device code.
        return thrust::reduce(thrust::device, av_dpt, av_dpt + N_);
    }

    int* av_;  // start of the range reduced by operator()
    int* bv_;  // stored only; not read by operator()
    int N_;    // number of elements reduced starting at av_
};


// Copies two small host vectors to the device and sums aFuntor's result over
// one invocation per element with thrust::transform_reduce.
int main(void)
{
      const std::vector<int> av = {0,1,3,5};
      const std::vector<int> bv = {0,10,20,30};

      // Derive the element count from the data instead of hard-coding it, so
      // the reduction inside aFuntor never reads past the end of av_d.
      const int N = static_cast<int>(av.size());

      // Construct the device vectors directly from the host data; their size
      // then always matches N.
      thrust::device_vector<int> av_d(av);
      thrust::device_vector<int> bv_d(bv);

      // initial value of the reduction
      int init=0;

      // binary operations
      thrust::plus<int>        bin_op;

      // One functor invocation per element: indices [0, N).
      int res =
      thrust::transform_reduce(thrust::counting_iterator<int>(0),
                               thrust::counting_iterator<int>(N),
                   aFuntor(thrust::raw_pointer_cast(av_d.data()),
                      thrust::raw_pointer_cast(bv_d.data()),
                      N),
                init,
                bin_op);

      std::cout << "result is: " << res << std::endl;
      return 0;
}
$ nvcc -std=c++11 -arch=sm_61  -o t113 t113.cu
$ ./t113
result is: 36
$

我并没有去仔细推断您这段代码的意图，所以不能肯定这个输出就是正确结果，不过这似乎也不是您要问的问题。（后来补充：结果看起来是正确的。您的仿函数对每个元素都返回 9，而您是在 4 个元素上对这些 9 做归约：9 × 4 = 36。）

话虽如此，（至少对我来说）Thrust 在您的原始代码中为什么会选择主机路径，并不完全清楚。如果您愿意，可以就此向 Thrust 提交一个 issue。不过也完全有可能是我没有仔细考虑 Thrust 的调度机制。主机端代码的算法调度（transform_reduce）可能有些令人困惑，因为例如您使用的究竟是主机容器还是设备容器，可能并不明显。

在 Thrust 仿函数中调用 Thrust 算法

1 个答案: