Question

有没有办法将CUDA数学函数自动包装成仿函数，这样无需手动编写仿函数就可以直接把它们用于thrust::transform？类似于（据我了解）std::function所提供的功能？

thrust::placeholders似乎不支持数学函数，std::function似乎也无法使用。

示例代码：

#include <thrust/transform.h>
#include <thrust/device_vector.h>
#include <iostream>
#include <functional>
#include <math.h>

// Binary functor that forwards to hypot so the operation can be passed to
// thrust::transform as a device-callable object.
struct myfunc{
    // Returns hypot(x, y). The operator is const because the functor holds no
    // state; this also lets it be invoked through a const object or reference.
    __device__
    double operator()(double x,double y) const {
        return hypot(x,y);
    }
};

// Demonstration for the question: tries three ways of handing hypot to
// thrust::transform (a hand-written functor, a placeholder expression, and a
// std::function wrapper) and prints the inputs and the result vector.
int main(){

    // Host-side input arrays, ten elements each.
    double x0[10] = {3.,0.,1.,2.,3.,4.,5.,6.,7.,8.};
    double y0[10] = {4.,0.,1.,2.,3.,4.,5.,6.,7.,8.};

    // Copy the inputs into device vectors and allocate a ten-element result vector.
    thrust::device_vector<double> x(x0,x0+10);
    thrust::device_vector<double> y(y0,y0+10);
    thrust::device_vector<double> r(10);

    // Echo both input arrays, one per line.
    for (int i=0;i<10;i++) std::cout << x0[i] <<" ";    std::cout<<std::endl;
    for (int i=0;i<10;i++) std::cout << y0[i] <<" ";    std::cout<<std::endl;

    // this works: the hand-written functor myfunc is applied pairwise to x and y,
    // writing into r.
    thrust::transform(x.begin(),x.end(),y.begin(),r.begin(), myfunc());

    // this doesn't compile: hypot is called directly on the placeholders _1 and _2.
    using namespace thrust::placeholders;
    thrust::transform(x.begin(),x.end(),y.begin(),r.begin(), hypot(_1,_2));

    // nor does this: hypot wrapped in a std::function<double(double,double)>.
    thrust::transform(x.begin(),x.end(),y.begin(),r.begin(), std::function<double(double,double)>(hypot));


    // Print the result vector.
    for (int i=0;i<10;i++) std::cout << r[i] <<" ";    std::cout<<std::endl;
}

Answer 1

将我的评论转换为这个答案：

正如@JaredHoberock所说，没有自动的方式来实现你想要的效果，总会有一些语法上或键入上的开销。

减少编写单独仿函数的开销的一种方法（就像你用myfunc所做的那样）是使用lambda。从CUDA 7.5开始，有一个实验性的device lambda特性（experimental device lambda feature），允许你进行以下操作：

auto h = []__device__(double x, double y){return hypot(x,y);};
thrust::transform(x.begin(),x.end(),y.begin(),r.begin(), h);

您需要添加以下nvcc编译器开关来编译它：

nvcc --expt-extended-lambda ...

另一种方法是使用以下Wrapper将函数转换为仿函数：

template<typename Sig, Sig& S>
struct Wrapper;

template<typename R, typename... T, R(&function)(T...)>
struct Wrapper<R(T...), function>
{
    __device__
    R operator() (T&... a)
    {
        return function(a...);
    }
};

然后你会像这样使用它：

thrust::transform(x.begin(),x.end(),y.begin(),r.begin(), Wrapper<double(double,double), hypot>());

import matplotlib 

def empty_get_cachedir(*args, **kwargs):
    """Accept any arguments, ignore them, and return None."""
    return None

matplotlib.get_cachedir = empty_get_cachedir

Answer 2

正如m.s.所说，减少编写仿函数开销的一种可能方法是使用lambda表达式。请注意，GPU lambda可在CUDA 8.0 RC中使用（尽管仍处于试验阶段）。另一种可能性是使用占位符技术。下面分别给出上述两种方法的可运行示例：

LAMBDA EXPRESSIONS

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

// Available for device operations only from CUDA 8.0 (experimental stage)
// Compile with the flag --expt-extended-lambda

using namespace thrust::placeholders;

int main(void)
{
    // --- Input data 
    float a = 2.0f;
    float x[4] = { 1, 2, 3, 4 };
    float y[4] = { 1, 1, 1, 1 };

    thrust::device_vector<float> X(x, x + 4);
    thrust::device_vector<float> Y(y, y + 4);

    thrust::transform(X.begin(), 
                      X.end(),  
                      Y.begin(), 
                      Y.begin(),
                      [=] __host__ __device__ (float x, float y) { return a * x + y; }      // --- Lambda expression 
                     );        

    for (size_t i = 0; i < 4; i++) std::cout << a << " * " << x[i] << " + " << y[i] << " = " << Y[i] << std::endl;

    return 0;
}

PLACEHOLDERS

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

using namespace thrust::placeholders;

int main(void)
{
    // --- Input data 
    float a = 2.0f;
    float x[4] = { 1, 2, 3, 4 };
    float y[4] = { 1, 1, 1, 1 };

    thrust::host_vector<float> X(x, x + 4);
    thrust::host_vector<float> Y(y, y + 4);

    thrust::transform(X.begin(), X.end(),
                      Y.begin(),         
                      Y.begin(),
                      a * _1 + _2);

    for (size_t i = 0; i < 4; i++) std::cout << a << " * " << x[i] << " + " << y[i] << " = " << Y[i] << std::endl;

    return 0;
}

CUDA Thrust：数学函数的快捷用法

2 个答案: