Question

我将函数指针封装在结构/类中。我可以轻松地在CPU实现中使用这些函数。但是，如果我想在CUDA中使用函数指针，我必须通过CUDA指令注册这些函数。不幸的是，事情在这里变得棘手。我想要的是从包含函数指针的类中创建主机和设备函数指针。

但是让我们从结构开始：

#ifndef TRANSFERFUNCTIONS_H_
#define TRANSFERFUNCTIONS_H_

#ifndef SWIG
#include <cmath>
#include <stdio.h>
#include <string.h>
#endif

// Pi as a single-precision literal. Not referenced by the code shown in this header.
#define PI    3.14159265358979323846f 



// Pointer to a function taking two floats and returning float
// (the shape of fcn_gaussian_nhood and of the `distance` callback).
typedef float (*pDistanceFu) (float, float);
// Pointer to a function taking three floats and returning float
// (the shape of fcn_rad_decay / fcn_lrate_decay and of the decay callbacks).
typedef float (*pDecayFu) (float, float, float);


//////////////////////////////////////////////////////////////////////////////////////////////
// Gaussian neighbourhood weight: exp(-dist^2 / (2 * sigmaT^2)).
//   dist   - distance value to weight
//   sigmaT - width parameter of the Gaussian
// Returns 1 for dist == 0 when sigmaT is non-zero. sigmaT == 0 is not guarded:
// the division then follows IEEE rules (result is 0 for dist != 0, NaN for dist == 0).
// Built as a host+device function under nvcc, host-only otherwise.
#ifdef __CUDACC__
        __host__ __device__
#endif
inline static float
fcn_gaussian_nhood (float dist, float sigmaT) {
        // Square by multiplication rather than pow(x, 2.f), and call expf rather
        // than the unqualified exp, so the whole expression stays in single
        // precision on both host and device.
        const float distSq = dist * dist;
        const float twoSigmaSq = 2.f * sigmaT * sigmaT;
        return expf(-distSq / twoSigmaSq);
}

// Exponential decay rounded to a whole number:
//   floor(sigma0 * exp(-T / lambda) + 0.5)
//   sigma0 - value being decayed (presumably the initial radius - confirm with callers)
//   T      - decay argument (presumably the current step - confirm with callers)
//   lambda - decay scale; lambda == 0 is not guarded
// Built as a host+device function under nvcc, host-only otherwise.
#ifdef __CUDACC__
        __host__ __device__
#endif
inline static float
fcn_rad_decay (float sigma0, float T, float lambda) {
        // expf/floorf keep the computation in single precision; the unqualified
        // exp may resolve to the double overload on the host.
        const float decayed = sigma0 * expf(-T / lambda);
        return floorf(decayed + 0.5f);
}

    //////////////////////////////////////////////////////////////////////////////////////////////
// Exponential decay: sigma0 * exp(-T / lambda).
//   sigma0 - value being decayed (the functor in the implementation file passes the learning rate)
//   T      - decay argument (the functor passes its fCycle)
//   lambda - decay scale (the functor passes its fCycles); lambda == 0 is not guarded
// Built as a host+device function under nvcc, host-only otherwise.
#ifdef __CUDACC__
        __host__ __device__
#endif
inline static float
fcn_lrate_decay (float sigma0, float T, float lambda) {
        // expf keeps the computation in single precision; the unqualified exp
        // may resolve to the double overload on the host.
        return sigma0 * expf(-T / lambda);
}

// Forward declaration so the pointer-to-member typedefs below can name the class.
class DistFunction;
// Free-function pointer types (repeated from earlier in this header).
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);
// Pointer-to-member-function variants; not used by the code shown in this header.
typedef float (DistFunction::*pmDistanceFu) (float, float);
typedef float (DistFunction::*pmDecayFu) (float, float, float);


// Bundles a name with three function pointers: a distance function and two
// decay functions (radius and learning rate).
class DistFunction {
private:
        // NOTE(review): these three members are not read or written by any code
        // visible here; Assign() declares locals with the same names.
        pDistanceFu hDist;
        pDecayFu hRadDecay; 
        pDecayFu hLRateDecay;

public:
        // Arguments, in order: name, distance function, radius decay, learning-rate decay.
        DistFunction(char *, pDistanceFu, pDecayFu, pDecayFu);
        // Overwrites the three public callbacks; defined in the implementation file.
        void Assign();

        // Stored as passed to the constructor; the string is not copied.
        char *name;
        pDistanceFu distance;
        pDecayFu rad_decay;
        pDecayFu lrate_decay;
};

// Defined in the implementation file.
void test();

#endif /* TRANSFERFUNCTIONS_H_ */

实现：

//#include <iostream>
#include "Functions.h"
#include <iostream>
#include <thrust/extrema.h>
#include <thrust/distance.h>
#include <thrust/device_vector.h>


// Builds a DistFunction from a name and three callbacks.
//   cstr  - name; the pointer is stored as-is, the string is not copied
//   dist  - distance function
//   rad   - radius decay function
//   lrate - learning-rate decay function
// The private pointers are set to null so that copying the object (for example
// into a functor held by value) never reads indeterminate members. Initializers
// are listed in declaration order.
DistFunction::DistFunction(char *cstr, pDistanceFu dist, pDecayFu rad, pDecayFu lrate)
        : hDist(0),
          hRadDecay(0),
          hLRateDecay(0),
          name(cstr),
          distance(dist),
          rad_decay(rad),
          lrate_decay(lrate) {
}

// Tries to replace the three public callbacks with pointers obtained through
// cudaMemcpyFromSymbol.
//
// On any CUDA error the callbacks are left untouched and the error is reported
// on std::cerr; previously the return codes were ignored and uninitialized
// locals were written into the members.
//
// NOTE(review): cudaMemcpyFromSymbol expects a variable that lives in device
// global or constant memory. `distance`, `rad_decay` and `lrate_decay` are
// ordinary members holding function pointers, so these calls are expected to
// fail at runtime. A working version needs one `__device__` pointer variable
// per function to copy from - confirm the intended design.
void DistFunction::Assign() {
        pDistanceFu devDist = 0;
        pDecayFu devRadDecay = 0;
        pDecayFu devLRateDecay = 0;

        // Stop at the first failing copy; later copies cannot make the result usable.
        cudaError_t status = cudaMemcpyFromSymbol(&devDist, distance, sizeof(pDistanceFu));
        if (status == cudaSuccess) {
                status = cudaMemcpyFromSymbol(&devRadDecay, rad_decay, sizeof(pDecayFu));
        }
        if (status == cudaSuccess) {
                status = cudaMemcpyFromSymbol(&devLRateDecay, lrate_decay, sizeof(pDecayFu));
        }

        if (status != cudaSuccess) {
                std::cerr << "DistFunction::Assign: cudaMemcpyFromSymbol failed: "
                          << cudaGetErrorString(status) << std::endl;
                return;  // keep the existing callbacks intact
        }

        distance = devDist;
        rad_decay = devRadDecay;
        lrate_decay = devLRateDecay;
}

// Global function set named "gaussian": Gaussian neighbourhood plus the two
// exponential decay functions declared in the header.
DistFunction fcn_gaussian(
        const_cast<char*>("gaussian"),
        fcn_gaussian_nhood,
        fcn_rad_decay,
        fcn_lrate_decay);



// Unary functor that applies a DistFunction's learning-rate decay to one value.
// It holds its own copy of the DistFunction plus the two extra decay arguments.
//
// NOTE(review): operator() calls through m_pfunc.lrate_decay. If that pointer
// was obtained in host code (as for the global fcn_gaussian), calling it from
// device code is not valid - confirm before using this functor in a device
// algorithm.
struct sm20lrate_decay_functor {
        float fCycle;
        float fCycles;
        DistFunction m_pfunc;

        // Initializers follow declaration order (fCycle, fCycles, m_pfunc).
        sm20lrate_decay_functor(const DistFunction &pfunc, float cycle, float cycles)
                : fCycle(cycle), fCycles(cycles), m_pfunc(pfunc) {}

        // Returns lrate_decay(lrate, fCycle, fCycles); does not modify the functor.
        __host__ __device__
        float operator()(float lrate) const {
                return (m_pfunc.lrate_decay)(lrate, fCycle, fCycles);
        }
};

// Runs the learning-rate decay functor over a zero-filled device vector of
// 4096 floats with thrust::transform, writing into a second vector of the
// same size.
void test() {
        const unsigned int elementCount = 4096;
        thrust::device_vector<float> decayedRates(elementCount, 0.f);
        thrust::device_vector<float> inputRates(elementCount, 0.f);

        sm20lrate_decay_functor decayOp(fcn_gaussian, 1, 100);
        thrust::transform(inputRates.begin(), inputRates.end(),
                          decayedRates.begin(), decayOp);
}

编辑：做了一个小例子。

似乎CUDA设备函数指针没用，因为我无法动态使用它们。至于它们为何被实现出来，对我来说仍然是个谜。难道CUDA并不真正支持函数指针，而只是以类似的方式使用函数引用吗？

Answer 1

问题不够明确。我将尝试重新措辞：是否有可能从主机获取设备函数的函数指针而不使用中间全局声明的变量？

这是可能的，尽管你表达的方式并不完全。

首先，在你的代码示例中，函数被标记为 inline static，因此，如果CUDA看不到其地址有任何用途，函数很可能会被内联，这样就无法获取指向它的指针。

其次，您没有记录GetDistFunction（）返回的内容，因此，我们不知道它返回的是什么符号。

您正在使用的方法 cudaMemcpyFromSymbol，其文档对 symbol 参数的说明是：

symbol是一个驻留在全局或常量内存空间中的变量。

函数指针符号不是变量，它是指向代码区域的指针。此外，GetDistFunction()->xxx不太可能是一个符号。

您使用的技术是实现您想要的功能的方法之一。您也可以在设备上初始化您的结构，在那里获取函数指针与在主机端一样简单。这样，您的代码将变得更简单，无需调用cudaMemcpyToSymbol，也无需为指针声明全局变量。下面的代码演示了这两种方法，其中第二种方法避免了使用中间的全局范围变量：

// Pointer to a function taking no arguments and returning int.
typedef int (*funcptr)();

// Device function whose address is taken below; returns 42.
__device__ int f()
{
    return 42;
}

// Device-side global variable initialized with the address of f.
// The host reads its value with cudaMemcpyFromSymbol.
__device__ funcptr f_ptr = f;

// Calls f twice and prints each result: once through the pointer passed in
// from the host, once through a pointer taken directly in device code.
__global__ void kernel(funcptr func)
{
    printf("%d\n", func());

    // Taken in device code, so no global-scope variable is involved.
    funcptr localPtr = f;
    int viaLocal = localPtr();
    printf("%d\n", viaLocal);
}


// Reads the device function pointer stored in f_ptr, passes it to the kernel,
// and reports success or failure. Returns 0 on success, 1 on any CUDA error.
int main()
{
    funcptr h_funcptr = 0;

    // Without a valid pointer the launch would call through garbage, so stop here.
    if (cudaSuccess != cudaMemcpyFromSymbol(&h_funcptr, f_ptr, sizeof(funcptr))) {
        printf("FAILED to get SYMBOL\n");
        return 1;
    }

    kernel<<<1, 1>>>(h_funcptr);
    // cudaGetLastError reports launch problems; the synchronize reports
    // errors raised while the kernel ran.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
        printf("FAILED\n");
        return 1;
    }

    printf("SUCCEEDED\n");
    return 0;
}

最后，作为设计注释，您可能想尝试使用虚函数，并在设备上构建类的相应实例，所有这些初始化步骤都将由编译器生成，这是一个示例：

// Abstract interface with a single pure virtual device function.
class T
{
public:
    __device__ virtual int f() const = 0;
};

// Concrete implementation of T whose f() returns 42.
class G : public T
{
public:
    __device__ virtual int f() const
    {
        return 42;
    }
};

// Creates a G on the device heap, calls f() through a base-class pointer,
// prints the result, and frees the object.
__global__ void kernel2()
{
    G* impl = new G();
    if (impl == 0) {
        return;  // device heap allocation failed; nothing to call
    }

    T* t = impl;
    int k = t->f();
    printf("%d\n", k);

    // Delete through the G pointer: T declares no virtual destructor, so
    // deleting through the T pointer would not be well-defined.
    delete impl;
}

// Launches kernel2 on a single thread. Returns 0 on success, 1 if the launch
// or the kernel execution reported a CUDA error.
int main()
{
    kernel2<<<1, 1>>>();
    // cudaGetLastError reports launch problems; the synchronize reports
    // errors raised while the kernel ran.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
        printf("FAILED\n");
        return 1;
    }
    return 0;
}

使用原型模式或单例模式会有所帮助。

Answer 2

我终于发现，问题中发布的示例无法通过设备函数指针实现，因为设备函数指针既不能在全局空间之外（例如在构造函数中）赋值，也不能动态赋值。

功能方面，CUDA函数指针的这个演示实现对应于下面的示例。

// Pointer to a function taking no arguments and returning int.
typedef int (*funcptr)();

// Device function whose address is taken below; returns 42.
__device__ int f()
{
    return 42;
}

// Device-side global variable initialized with the address of f.
// The host reads its value with cudaMemcpyFromSymbol.
__device__ funcptr f_ptr = f;

// Calls f twice and prints each result: once through the pointer passed in
// from the host, once through a pointer taken directly in device code.
__global__ void kernel(funcptr func)
{
    printf("%d\n", func());

    // Taken in device code, so no global-scope variable is involved.
    funcptr localPtr = f;
    int viaLocal = localPtr();
    printf("%d\n", viaLocal);
}


// Reads the device function pointer stored in f_ptr, passes it to the kernel,
// and reports success or failure. Returns 0 on success, 1 on any CUDA error.
int main()
{
    funcptr h_funcptr = 0;

    // Without a valid pointer the launch would call through garbage, so stop here.
    if (cudaSuccess != cudaMemcpyFromSymbol(&h_funcptr, f_ptr, sizeof(funcptr))) {
        printf("FAILED to get SYMBOL\n");
        return 1;
    }

    kernel<<<1, 1>>>(h_funcptr);
    // cudaGetLastError reports launch problems; the synchronize reports
    // errors raised while the kernel ran.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
        printf("FAILED\n");
        return 1;
    }

    printf("SUCCEEDED\n");
    return 0;
}

正如人们可以清楚看到的那样，上面的例子没有灵活性，因为必须将每个函数指针分配给全局空间中的设备符号。

// Device function called directly by the kernel below; returns 42.
__device__ int f()
{
    return 42;
}

// Calls f directly (no function pointer) and prints the result.
__global__ void kernel()
{
    printf("%d\n", f());
}

// Launches the kernel on a single thread and reports the outcome.
// Returns 0 on success, 1 if the launch or the kernel execution reported a
// CUDA error.
int main()
{
    kernel<<<1, 1>>>();
    // cudaGetLastError reports launch problems; the synchronize reports
    // errors raised while the kernel ran.
    if (cudaGetLastError() != cudaSuccess || cudaDeviceSynchronize() != cudaSuccess) {
        printf("FAILED\n");
        return 1;
    }

    printf("SUCCEEDED\n");
    return 0;
}

NVIDIA 的设备函数指针实现几乎毫无用处——与普通函数调用相比（除了命名之外）没有任何好处；绕开它的唯一方法是使用模板。不幸的是，模板不提供运行时灵活性。不过，与CUDA设备函数指针相比，这并不算劣势，因为它们同样不允许在运行时更换函数。

这是我针对上述问题的基于模板的解决方案。这些话对CUDA设备函数指针来说或许有些严厉，但如果有人能证明我错了，欢迎发布一个例子。

// Function-pointer shapes accepted as template arguments below.
typedef float (*pDistanceFu) (float, float);
typedef float (*pDecayFu) (float, float, float);

// Compile-time bundle of three functions: the functions are template
// arguments, so each static member forwards to a function fixed at compile
// time instead of calling through a runtime pointer.
//   Dist  - distance function
//   Rad   - radius decay function
//   LRate - learning-rate decay function
template <pDistanceFu Dist, pDecayFu Rad, pDecayFu LRate>
class DistFunction {
public:
        // The default constructor sets name to null instead of leaving it uninitialized.
        DistFunction() : name(0) {}
        // Stores the pointer as-is; the string is not copied.
        DistFunction(const char *cstr) : name(cstr) {}

        const char *name;

        // Each preprocessor directive sits on its own line; a directive in the
        // middle of a line is not valid.
#ifdef __CUDACC__
        __host__ __device__
#endif
        static float distance(float a, float b) { return Dist(a, b); }

#ifdef __CUDACC__
        __host__ __device__
#endif
        static float rad_decay(float a, float b, float c) { return Rad(a, b, c); }

#ifdef __CUDACC__
        __host__ __device__
#endif
        static float lrate_decay(float a, float b, float c) { return LRate(a, b, c); }
};

一个例子：

// Unary functor that forwards to F::lrate_decay with two stored arguments.
// F is expected to expose a static lrate_decay(float, float, float).
template <class F>
struct functor {
        float fCycle;
        float fCycles;

        functor(float cycle, float cycles) : fCycle(cycle), fCycles(cycles) {}

        __host__ __device__
        float operator()(float lrate) {
                return F::lrate_decay(lrate, fCycle, fCycles);
        }
};

// Function set built from the Gaussian neighbourhood and the two decay functions.
typedef DistFunction<fcn_gaussian_nhood, fcn_rad_decay, fcn_lrate_decay> gaussian;

// Instantiates the functor for the gaussian function set; the object is not used further.
void test() {
        functor<gaussian> decayOp(0, 1);
}

在CUDA中分配设备函数指针（来自主机函数指针）

2 个答案: