Question

我正在开发一个 CUDA 应用程序，需要把一些任意函数传给 CUDA 内核。由于可能的函数超过 50 种，为每一种都声明函数指针再传给内核过于繁琐，而且它们都只是由基本函数组合而成（例如 sin(x)/y），所以我想为 CUDA 内核提供一个最小化的 lambda 表达式功能。据我所知，设备代码不支持 C++11 特性，我在网上也没有找到相关资料，于是决定自学表达式模板（expression templates），实现几条简单的 lambda 表达式规则，以便把表达式传入内核。

我写出了下面的代码，这是一个可以用 NVCC 编译并正常运行的最小实现。但是按照这个思路，我只能实现单变量函数。有没有办法扩展这段代码，使其能够处理 sin(_x) + _y 这样的多变量函数组合？

提前致谢！

#include<math.h>

// HOST_DEVICE expands to `__host__ __device__` when __CUDACC__ is defined and
// to nothing otherwise.
#ifdef __CUDACC__
#define HOST_DEVICE __host__ __device__
#else
#define HOST_DEVICE
#endif

// Empty tag type; in this listing it is used as the Op argument of the
// UnaryOp<Id, double> specialization.
struct Id {};

// Expression-template node for a binary operation.
//   Op    - policy type providing a static apply(first, second).
//   Left  - callable sub-expression producing the first operand.
//   Right - callable sub-expression producing the second operand.
// NOTE(review): operator() is not const-qualified, so a const BinaryOp cannot
// be evaluated.
template <typename Op, typename Left, typename Right>
struct BinaryOp
{
    Left left;
    Right right;

    // Stores copies of the two sub-expressions.
    HOST_DEVICE BinaryOp(Left lhs, Right rhs)
        : left(lhs), right(rhs)
    {
    }

    // Evaluates both sub-expressions at x and combines them with Op::apply.
    HOST_DEVICE double operator() (double x)
    {
        return Op::apply(left(x), right(x));
    }
};

// Expression-template node for a unary operation.
//   Op  - policy type providing a static apply(value).
//   Arg - callable sub-expression producing the operand.
template <typename Op, typename Arg>
struct UnaryOp
{
    Arg arg;

    // Stores a copy of the sub-expression.
    HOST_DEVICE UnaryOp(Arg operand)
        : arg(operand)
    {
    }

    // Evaluates the sub-expression at x and passes the result to Op::apply.
    HOST_DEVICE double operator() (double x)
    {
        return Op::apply(arg(x));
    }
};

// Leaf of the expression tree: UnaryOp<Id, double> holds no sub-expression and
// evaluates to its argument unchanged, i.e. it plays the role of the variable.
template <>
struct UnaryOp<Id, double>
{
    HOST_DEVICE UnaryOp()
    {
    }

    // Identity: returns x.
    HOST_DEVICE double operator() (double x)
    {
        return x;
    }
};

// Unary policy: apply() returns sin(x) for a double argument.
struct Sin
{
    HOST_DEVICE static double apply(double x)
    {
        const double result = sin(x);
        return result;
    }
};

// Binary policy: apply() returns the sum of its two double arguments.
struct Plus
{
    HOST_DEVICE static double apply(double a, double b)
    {
        const double sum = a + b;
        return sum;
    }
};

// Builds a BinaryOp<Plus, Left, Right> node from two sub-expressions (copied by
// value). Marked HOST_DEVICE, like the node types themselves, so expressions
// can be composed in device code as well as on the host.
// NOTE(review): this template is unconstrained, so it is a candidate for any
// `a + b` in which at least one operand has class or enumeration type —
// consider restricting it to expression-node types.
template <typename Left, typename Right>
HOST_DEVICE BinaryOp<Plus, Left, Right> operator+ (Left lhs, Right rhs) {
    return BinaryOp<Plus, Left, Right>(lhs, rhs);
}

// Builds a UnaryOp<Sin, Arg> node wrapping the given sub-expression (copied by
// value). Marked HOST_DEVICE, like the node types themselves, so expressions
// can be composed in device code as well as on the host.
template <typename Arg>
HOST_DEVICE UnaryOp<Sin, Arg> _sin(Arg arg) {
    return UnaryOp<Sin, Arg>(arg);
}

// Demo kernel: evaluates the callable `func` at x and prints the result with
// device-side printf. There is no thread-index guard, so every launched thread
// performs the evaluation and prints one line.
//   func - expression object, passed to the kernel by value.
//   x    - value at which the expression is evaluated.
template <class T>
__global__ void test(T func, double x)
{
    printf("%e\n", func(x));
}

// Builds the expression sin(_x) + _x on the host, evaluates it at x = 1.0 in a
// single-thread kernel and reports any CUDA failure.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main ()
{
    UnaryOp<Id, double> _x;
    double x = 1.0;

    test<<<1, 1>>>(_sin(_x) + _x, x);

    // A kernel launch returns no status itself: launch-configuration errors are
    // reported by cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Needed or the host will return before the kernel is finished; the
        // return value also carries errors raised while the kernel executed.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

Answer 1

提出这个问题之后，我花了些时间，拼凑出了一个解决方案。它很丑陋，但能满足我自己的需要。下面是修改后的代码，最多支持 3 个自由变量。更多变量可以用同样的方式硬编码，但我的项目目前用不到。

#include<math.h>

// HOST_DEVICE expands to `__host__ __device__` when __CUDACC__ is defined and
// to nothing otherwise.
#ifdef __CUDACC__
#define HOST_DEVICE __host__ __device__
#else
#define HOST_DEVICE
#endif

// Empty tag type. Nothing else in the visible part of this listing refers to
// it (variables are represented by Var<argnum> here).
struct Id {};

// Expression-template node for a binary operation over up to three variables.
//   Op    - policy type providing a static apply(first, second).
//   Left  - callable sub-expression producing the first operand.
//   Right - callable sub-expression producing the second operand.
// NOTE(review): operator() is not const-qualified, so a const BinaryOp cannot
// be evaluated.
template <typename Op, typename Left, typename Right>
struct BinaryOp
{
    Left left;
    Right right;

    // Stores copies of the two sub-expressions.
    HOST_DEVICE BinaryOp(Left lhs, Right rhs)
        : left(lhs), right(rhs)
    {
    }

    // Forwards all three variable values to both sub-expressions and combines
    // the results with Op::apply. x2 and x3 default to 0.0 when omitted.
    HOST_DEVICE double operator() (double x1, double x2 = 0.0, double x3 = 0.0)
    {
        return Op::apply(left(x1, x2, x3), right(x1, x2, x3));
    }
};

// Expression-template node for a unary operation over up to three variables.
//   Op  - policy type providing a static apply(value).
//   Arg - callable sub-expression producing the operand.
template <typename Op, typename Arg>
struct UnaryOp
{
    Arg arg;

    // Stores a copy of the sub-expression.
    HOST_DEVICE UnaryOp(Arg operand)
        : arg(operand)
    {
    }

    // Forwards all three variable values to the sub-expression and passes the
    // result to Op::apply. x2 and x3 default to 0.0 when omitted.
    HOST_DEVICE double operator() (double x1, double x2 = 0.0, double x3 = 0.0)
    {
        return Op::apply(arg(x1, x2, x3));
    }
};

// Leaf of the expression tree: Var<argnum> selects one of the (up to three)
// variable values passed to operator().
//   argnum - 1-based index of the variable to return; must be 1, 2 or 3.
template <int argnum>
struct Var
{
    // C++03-compatible compile-time check: the array bound becomes -1, which is
    // ill-formed, when argnum is outside 1..3. Without it, any other index
    // would silently evaluate to x3.
    typedef char argnum_must_be_1_2_or_3[(argnum >= 1 && argnum <= 3) ? 1 : -1];

    HOST_DEVICE Var() {}

    // Returns x1, x2 or x3 according to argnum. x2 and x3 default to 0.0 when
    // omitted. argnum is a compile-time constant, so the selection does not
    // depend on runtime data.
    HOST_DEVICE double operator() (double x1, double x2 = 0.0, double x3 = 0.0) {
        if (1 == argnum) return x1;
        else if (2 == argnum) return x2;
        else return x3;
    }
};

// Unary policy: apply() returns sin(x) for a double argument.
struct Sin
{
    HOST_DEVICE static double apply(double x)
    {
        const double result = sin(x);
        return result;
    }
};

// Binary policy: apply() returns the sum of its two double arguments.
struct Plus
{
    HOST_DEVICE static double apply(double a, double b)
    {
        const double sum = a + b;
        return sum;
    }
};

// Builds a BinaryOp<Plus, Left, Right> node from two sub-expressions (copied by
// value). Marked HOST_DEVICE, like the node types themselves, so expressions
// can be composed in device code as well as on the host.
// NOTE(review): this template is unconstrained, so it is a candidate for any
// `a + b` in which at least one operand has class or enumeration type —
// consider restricting it to expression-node types.
template <typename Left, typename Right>
HOST_DEVICE BinaryOp<Plus, Left, Right> operator+ (Left lhs, Right rhs) {
    return BinaryOp<Plus, Left, Right>(lhs, rhs);
}

// Builds a UnaryOp<Sin, Arg> node wrapping the given sub-expression (copied by
// value). Marked HOST_DEVICE, like the node types themselves, so expressions
// can be composed in device code as well as on the host.
template <typename Arg>
HOST_DEVICE UnaryOp<Sin, Arg> _sin(Arg arg) {
    return UnaryOp<Sin, Arg>(arg);
}

// Demo kernel: evaluates the callable `func` at (x, y, z) and prints the result
// with device-side printf. There is no thread-index guard, so every launched
// thread performs the evaluation and prints one line.
//   func    - expression object, passed to the kernel by value.
//   x, y, z - values of the first, second and third variable; z defaults to 0.0.
template <class T>
__global__ void test(T func, double x, double y, double z = 0.0) {
    // Forward z as well: calling func(x, y) would make every third-variable
    // term evaluate to the 0.0 default regardless of the z passed by the caller.
    printf("%e\n", func(x, y, z));
}

// Placeholder objects used to write expressions: _x evaluates to the first
// argument and _y to the second (see Var<argnum>::operator()).
// NOTE(review): names beginning with an underscore are reserved to the
// implementation in the global namespace (this also applies to _sin) —
// consider renaming them or moving them into a namespace.
Var<1> _x;
Var<2> _y;

// Evaluates sin(_x) + _y at (1.0, 2.0) in a single-thread kernel and reports
// any CUDA failure.
// Returns 0 on success, 1 if the launch or the kernel execution failed.
int main ()
{
    test<<<1, 1>>>(_sin(_x) + _y, 1.0, 2.0);

    // A kernel launch returns no status itself: launch-configuration errors are
    // reported by cudaGetLastError().
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess) {
        // Needed or the host will return before the kernel is finished; the
        // return value also carries errors raised while the kernel executed.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }
    return 0;
}

这显然是一个丑陋的 hack：lambda 表达式仅适用于 double（或可转换为 double 的类型）。不过我目前想不出解决这一限制的办法。希望 NVCC 能尽快支持 C++11 特性，这样我就不再需要这种 hack 了。

如果有人能给出更好的解决方案，无论是现成的库还是更好的实现技巧，我都将不胜感激。谢谢你的帮助！

使用Cuda中的表达式模板构建lambda表达式

1 个答案: