Question

我想在CPU上动态创建一个函数指针列表（使用从main()调用的某种push_back()方法），并将其复制到GPU的__constant__或__device__数组中，而无需求助于静态__device__函数指针。我相信this question与我的问题有关；但是，我的目标是迭代地创建__host__函数指针数组，然后将其复制到__constant__函数指针数组，而不是在声明时初始化后者。

带有静态函数指针的工作代码示例（如here或here所示）将是：

COMMON.H：

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <iostream>

#define num_functions 3

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
__constant__ fptr_t constant_fList[num_functions];

// Kernel called from main(): choose the function to apply whose index is equal to thread ID
__global__ void kernel(int a, int b) {
  fptr_t f;
  if (threadIdx.x < num_functions) {
    f = constant_fList[threadIdx.x];
    f(a,b);
  }
}

#endif

main.cu：

#include "common.h"

// Static device function pointers
__device__ fptr_t p_Add = Add;
__device__ fptr_t p_Sub = Subtract;
__device__ fptr_t p_Mul = Multiply;

// Load function list to constant memory
void loadList_staticpointers() {
  fptr_t h_fList[num_functions];
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[0], p_Add, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[1], p_Sub, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyFromSymbol(&h_fList[2], p_Mul, sizeof(fptr_t)) );
  gpuErrchk( cudaMemcpyToSymbol(constant_fList, h_fList, num_functions * sizeof(fptr_t)) );
}

int main() {
  loadList_staticpointers();
  int a = 12, b = 15;
  kernel<<<1,3>>>(a, b);
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());
  return 0;
}

规格：GeForce GTX 670，针对-arch=sm_30编译，CUDA 6.5，Ubuntu 14.04

我希望避免使用静态设备函数指针，因为每追加一个函数都需要在用户端进行代码维护——声明一个新的静态指针（如p_Add或p_Mul）、修改void loadList_staticpointers()等。为了说清楚，我正在尝试类似以下（会崩溃的）代码：

main_wrong.cu：

p_Add

我的理解是：指向主机地址的函数指针被复制到了GPU上，而内核无法使用它们——调用函数f(a,b)时，内核需要的是指向GPU地址的指针。用设备端指针填充主机端数组的做法对原始数据可行（请参阅this question），但对函数指针不可行。使用统一内存的简单尝试也失败了……到目前为止，我只发现静态设备端指针可行。难道没有其他方法可以将动态创建的CPU函数指针数组复制到GPU上吗？

Answer 1

如果您可以使用C ++ 11（从CUDA 7开始支持），您可以使用以下命令自动生成函数表：

// Dispatch kernel: the candidate functions are supplied as non-type template
// arguments, so the lookup table is assembled by the compiler (needs C++11
// for constexpr and parameter packs).
// Launch layout: 1-D block; thread i calls the i-th template argument with
// (a, b). Threads whose index is past the end of the pack do nothing.
template <fptr_t... Functions>
__global__ void kernel(int a, int b)
{
  constexpr fptr_t dispatch[] = { Functions... };
  constexpr auto entry_count = sizeof...(Functions);

  const auto lane = threadIdx.x;
  if (lane >= entry_count)
    return;

  dispatch[lane](a, b);
}

然后您可以这样调用此内核：

// Instantiate the kernel with the three device functions as template
// arguments and launch it with one block of three threads.
kernel<Add, Subtract, Multiply><<<1,3>>>(a, b);

Answer 2

受 ms 的回答启发，我选择将函数指针作为模板参数传递——这实际上是解决我的问题的关键——并发现：在不借助静态函数指针的情况下，确实可以从main()函数中迭代地填充__device__函数指针数组dev_fList，而且甚至不需要C++11兼容性！

以下是全局内存中__device__数组的工作示例。我还没有尝试过它的常量内存对应物，但是一旦全局存储器阵列被令人满意地创建，我的猜测是cudaMemcpyToSymbol(..., cudaMemcpyDeviceToDevice)应该可以做到这一点。

内核kernel()为函数指针dev_f创建GPU地址，并复制作为模板参数传递的函数f。由于这是来自CPU的迭代过程，因此该内核中只涉及一个线程（线程0），该线程使用配置<<<1,1>>>启动。静态变量count_f负责dev_fList中的索引。

COMMON.H：

#ifndef COMMON_H
#define COMMON_H

#include <stdio.h>
#include <stdlib.h>   // exit()
#include <iostream>

// Capacity of the device-side function-pointer table.
#define num_functions 3

// Wraps a CUDA runtime call and reports a failure with file/line information.
// The do/while(0) form makes the macro behave as a single statement, so it is
// safe inside an un-braced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Prints the CUDA error string together with the source location to stderr
// when `code` is not cudaSuccess.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the failing call
//   line  - source line of the failing call
//   abort - when true (default), terminates the process with `code` as the
//           exit status after printing
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

// fptr_t: Pointer to void function that takes two integer lvalues
typedef void (*fptr_t)(int&, int&);

// some examples of void(int&, int&) functions...
__device__ void Add(int &a, int &b) {printf("Add... %i + %i = %i\n", a, b, a+b);}
__device__ void Subtract(int &a, int &b) {printf("Subtract... %i - %i = %i\n", a, b, a-b);}
__device__ void Multiply(int &a, int &b) {printf("Multiply... %i * %i = %i\n", a, b, a*b);}

// List of function pointers in device memory
// Note that, in my example, it resides in global memory space, not constant memory
__device__ fptr_t dev_fList[num_functions];

#endif

main.cu：

#include "common.h"

// Next free slot in dev_fList[]; it equals the number of addFunc<>() calls
// that have completed so far.
static int count_f = 0;

// Registration kernel, meant to be launched as <<<1,1>>>.
// Stores the device address of the template argument f in dev_fList[idx] and
// immediately calls it through the table with (a, b) to show that the slot
// was populated correctly.
template<fptr_t f>
__global__ void kernel(int a, int b, int idx)
{
  dev_fList[idx] = f;
  dev_fList[idx](a, b);
}

// Appends function f to the device-side table.
// Exits the process with EXIT_FAILURE when the table already holds
// num_functions entries; otherwise launches the registration kernel, checks
// the launch and the execution for errors, and advances count_f.
template<fptr_t f>
void addFunc(const int &a, const int &b)
{
  const bool table_full = !(count_f < num_functions);
  if (table_full)
  {
    std::cout << "Error: not enough memory statically allocated on device!\n";
    exit(EXIT_FAILURE);
  }

  const int slot = count_f;
  kernel<f><<<1,1>>>(a, b, slot);
  gpuErrchk(cudaGetLastError());
  gpuErrchk(cudaDeviceSynchronize());
  count_f = slot + 1;
}

// Registers Add, Subtract and Multiply, in that order.
int main()
{
  int a = 12;
  int b = 15;

  addFunc<Add>(a, b);
  addFunc<Subtract>(a, b);
  addFunc<Multiply>(a, b);

  return 0;
}

编辑：添加了函数指针数组的副本到常量内存

对于它的价值，这里是如何将我们的dev_fList数组复制到常量内存：

在common.h中：

// Constant-memory copy of the function-pointer table.
__constant__ fptr_t cst_fList[num_functions];

// Test kernel: every thread whose index is below idx calls the table entry
// with the same index, passing (a, b). Remaining threads return immediately.
__global__ void cst_test(int a, int b, int idx)
{
  if (threadIdx.x >= idx)
    return;

  fptr_t target = cst_fList[threadIdx.x];
  target(a, b);
}

在main.cu main()函数中，添加了所有需要的函数后：

// Stage the device-side table through a host buffer that is large enough for
// every entry. (A single `fptr_t *` only has room for one pointer, so copying
// count_f entries into its address would overrun the stack.)
fptr_t temp[num_functions];

// The symbol arguments are the arrays themselves, not their first elements.
gpuErrchk( cudaMemcpyFromSymbol(temp, dev_fList, count_f * sizeof(fptr_t)) );
gpuErrchk( cudaMemcpyToSymbol(cst_fList, temp, count_f * sizeof(fptr_t)) );

// One thread per registered function; each calls its entry from constant memory.
cst_test<<<1,count_f>>>(a,b, count_f);
gpuErrchk(cudaGetLastError());
gpuErrchk(cudaDeviceSynchronize());

据我所知，内存可能会通过temp传输到主机然后再返回设备，这看起来很丑陋;欢迎提出更优雅的建议。

Answer 3

不可能使用动态创建的CUDA设备函数指针（至少无法做到不崩溃或不产生未定义行为）。基于模板的解决方案是在编译时起作用的（非动态）。您随处可见的CUDA设备函数指针方法都需要在全局空间中有一个设备符号。这意味着对于每个函数，都必须事先声明一个设备函数指针。这也意味着您不能像普通的C函数指针那样，把它当作可在运行时设置的引用来使用。理解了这一点，CUDA设备函数指针的用处就值得商榷了。基于模板的方法看起来对用户友好，但按其定义就不是动态的。

显示带有函数指针的结构的示例：

此示例显示了具有一些函数指针的结构。在普通的C ++代码中，您可以在程序运行时（动态）设置和更改设备函数指针。使用CUDA，下面的示例不可能，因为结构中的函数指针不是有效的设备符号。这意味着它们不能与“cudaMemcpyFromSymbol”一起使用。为了避免这种情况，必须创建原始函数（函数指针的目标）或全局cuda设备函数指针。两者都不是动态的。

这是动态赋值：

// Function-pointer signatures: a two-argument distance function and a
// three-argument decay function, all operating on floats.
using pDistanceFu = float (*)(float, float);
using pDecayFu    = float (*)(float, float, float);

// Bundle of function pointers. In plain C++ each member can be set and reset
// at run time whenever needed.
struct DistFunction {
  /*__host__ __device__*/ pDistanceFu distance;    // uncomment the qualifiers for NVCC
  /*__host__ __device__*/ pDecayFu    rad_decay;
  /*__host__ __device__*/ pDecayFu    lrate_decay;
};

// Plain host C++: function-pointer members can be reassigned freely at run time.
DistFunction foo, bar;
foo.distance = bar.distance; // copies the pointer value held by bar into foo
// ..

这是在CUDA中本应采用的写法，但它会失败，因为没有有效的设备符号 :(

// Host-side staging variables that receive the values read back by
// cudaMemcpyFromSymbol before they are written into the struct.
pDistanceFu hDistance; 
pDecayFu hRadDay; 
pDecayFu hLRateDecay; 

// Counter-example: tries to replace every function pointer in `dist` with the
// value obtained by treating that same member as a device symbol.
// The members of `dist` are ordinary struct fields, not __device__ variables,
// so they are not valid symbols for cudaMemcpyFromSymbol and the calls fail.
// The return codes are not checked here, so the failure is silent and the
// members are overwritten with whatever the staging variables hold.
void DeviceAssign(DistFunction &dist) {      
  cudaMemcpyFromSymbol(&hDistance, dist.distance, sizeof(pDistanceFu) );
  cudaMemcpyFromSymbol(&hRadDay, dist.rad_decay, sizeof(pDecayFu) );
  cudaMemcpyFromSymbol(&hLRateDecay, dist.lrate_decay, sizeof(pDecayFu) );

  // Write the staged values back into the struct.
  dist.distance = hDistance;
  dist.rad_decay = hRadDay;
  dist.lrate_decay = hLRateDecay;
}

这是经典的方法，但您会注意到，它不再是动态的，因为设备符号必须引用函数本身，而不是一个在运行时可能改变的指针。

// .. and this would work
// Decay function with the pDecayFu signature; compiled for both host and
// device when built with NVCC, and as a plain inline function otherwise.
// Returns sigma0 * exp(-T / lambda), rounded by adding 0.5 and taking the floor.
#ifdef __CUDACC__
  __host__ __device__
#endif
inline float fcn_rad_decay (float sigma0, float T, float lambda) {
  const float exponent = -T / lambda;
  return std::floor(sigma0 * exp(exponent) + 0.5f);
}

// Device-resident pointer to fcn_rad_decay. It is declared as pDecayFu
// because fcn_rad_decay takes three floats; the two-argument pDistanceFu type
// does not match that signature. The pointer has to be initialised from the
// function itself at compile time; a host pointer cannot be used here.
__device__ pDecayFu pFoo = fcn_rad_decay;

// Reads the device address stored in pFoo back to the host and stores it in
// dist.lrate_decay.
// cudaMemcpyFromSymbol needs a __device__/__constant__ variable as its symbol
// argument, so the copy goes through pFoo rather than through the address of
// the function.
// If the copy fails, `dist` is left unchanged.
void DeviceAssign2(DistFunction &dist) {
  if (cudaMemcpyFromSymbol(&hLRateDecay, pFoo, sizeof(pDecayFu)) != cudaSuccess) {
    return;
  }
  // ..

  dist.lrate_decay = hLRateDecay;
  // ..
}

CUDA：将动态创建的CPU上的函数指针数组复制到GPU内存中

3 个答案: