我想做以下事情:
#include <thrust/tuple.h>
#include <tuple>
// Host-side variadic template: takes a function pointer and its call
// arguments packed in a thrust::tuple. The call site below fails with
// "template argument deduction/substitution failed".
// NOTE(review): older Thrust implements tuple as a fixed-arity (10-slot)
// template padded with null_type, so thrust::tuple<Args...> may not match
// the type thrust::make_tuple() returns — likely the cause of the
// deduction failure; confirm against the Thrust version in use.
template<typename... Args>
void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}
// Target for the function-pointer parameter above: takes a single int
// and intentionally does nothing (minimal reproduction).
void otherFunction(int n) {
}
// Minimal reproduction driver: even with the explicit <int> template
// argument, the call below fails to compile with the error quoted in
// the comment.
int main(int argc, char **argv) {
//// template argument deduction/substitution failed ////
// NOTE(review): thrust::make_tuple(1) may not produce exactly
// thrust::tuple<int> (null_type padding in older Thrust), which would
// make it mismatch thrust::tuple<Args...> — confirm.
someFunction<int>(&otherFunction, thrust::make_tuple(1));
return 0;
}
我尝试过:在 someFunction 中改用带静态函数的 struct,那样可以正常工作。但在原始代码中 someFunction 是一个 CUDA 内核,所以我不能这样做。还有什么想法?
编辑:为了更清楚:someFunction 和 otherFunction 都是 __global__ 函数!
#include <thrust/tuple.h>
#include <tuple>
// Kernel version of the variadic template from the first snippet:
// receives a device function pointer plus its arguments in a
// thrust::tuple. Body left empty — this snippet only demonstrates the
// same deduction/substitution failure for a __global__ function.
template<typename... Args>
__global__ void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}
// Kernel whose address is stored in the __constant__ pointer below and
// meant to be invoked through someFunction. Empty body (reproduction).
__global__ void otherFunction(int n) {
}
// Device-visible pointer to the kernel, set at static-initialization time.
// NOTE(review): in host code &otherFunction yields the host launch stub,
// not a device-callable address — verify this initializer does what the
// author intends on the target toolkit.
__constant__ void (*kfp)(int) = &otherFunction;
// Host driver: copy the stored pointer back to the host, then try to pass
// it into the templated kernel. The cudaMemcpyFromSymbol return value is
// deliberately unchecked (reproduction code, not production).
int testPassMain(int argc, char **argv) {
void (*h_kfp)(int);
cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
// Fails to compile: "template argument deduction/substitution failed".
someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
return 0;
}
我在两个示例中都遇到了编译器错误:template argument deduction/substitution failed。
答案 0(得分:2)
将函数指针及其参数作为 thrust::tuple 传递给全局函数
这样的事情应该是可行的:
$ cat t1161.cu
#include <thrust/tuple.h>
#include <stdio.h>
// Answer's "someFunction": a kernel that takes a device function pointer
// fp and a tuple-like params, calling fp once per element. Exactly two
// elements are assumed here (get<0> and get<1>); the tuple type T is a
// plain template parameter, sidestepping pack deduction entirely.
template <typename T, typename T1>
__global__ void kernel(void (*fp)(T1), T params){ // "someFunction"
fp(thrust::get<0>(params));
fp(thrust::get<1>(params));
}
// Answer's "otherFunction": device function called through the pointer;
// prints its single int argument (device printf — debugging output).
__device__ void df(int n){ // "otherFunction"
printf("parameter = %d\n", n);
}
// Device-resident copy of df's address; a __device__ variable can be
// statically initialized with a device function's address, and the host
// retrieves it via cudaMemcpyFromSymbol below (host code cannot take
// df's address directly).
__device__ void (*ddf)(int) = df;
int main(){
void (*hdf)(int);
thrust::tuple<int, int> my_tuple = thrust::make_tuple(1,2);
// Copy the device function pointer to the host so it can be passed as an
// ordinary kernel argument. Error codes unchecked (example code).
cudaMemcpyFromSymbol(&hdf, ddf, sizeof(void *));
kernel<<<1,1>>>(hdf, my_tuple);
// Block until the kernel finishes so its printf output is flushed.
cudaDeviceSynchronize();
}
$ nvcc -o t1161 t1161.cu
$ cuda-memcheck ./t1161
========= CUDA-MEMCHECK
parameter = 1
parameter = 2
========= ERROR SUMMARY: 0 errors
$
如果您希望 df 成为 __global__ 函数,类似的方法也应该可行,您只需要正确处理动态并行的情形。同样,只需略微改动就可以把元组直接传递给子函数(即 df,无论它是设备函数还是内核)。既然你的参数已经很好地打包在 thrust 元组中,我不清楚你为什么还需要可变参数模板。
$ cat t1162.cu
#include <thrust/tuple.h>
#include <stdio.h>
// Parent kernel: launches the child kernel fp with params via CUDA
// dynamic parallelism (hence the -rdc=true / sm_35 build flags in the
// transcript below this snippet).
template<typename T>
__global__ void someFunction(void (*fp)(T), T params) {
fp<<<1,1>>>(params);
// Device-side sync: wait for the child grid to finish before returning.
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in
// newer CUDA toolkits — confirm the toolkit version this targets.
cudaDeviceSynchronize();
}
// Child kernel taking a one-element int tuple; prints element 0.
__global__ void otherFunction(thrust::tuple<int> t) {
printf("param 0 = %d\n", thrust::get<0>(t));
}
// Child kernel taking a two-element float tuple; prints element 1 only.
__global__ void otherFunction2(thrust::tuple<float, float> t) {
printf("param 1 = %f\n", thrust::get<1>(t));
}
// Device-resident copies of the two child-kernel addresses; the host
// reads them back with cudaMemcpyFromSymbol so they can be passed as
// plain kernel arguments.
__device__ void (*kfp)(thrust::tuple<int>) = &otherFunction;
__device__ void (*kfp2)(thrust::tuple<float, float>) = &otherFunction2;
int main(int argc, char **argv) {
void (*h_kfp)(thrust::tuple<int>);
void (*h_kfp2)(thrust::tuple<float, float>);
// Dispatch #1: int tuple through the first child kernel.
cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
someFunction<<<1,1>>>(h_kfp, thrust::make_tuple(1));
cudaDeviceSynchronize();
// Dispatch #2: float tuple through the second child kernel — same parent
// kernel template, different instantiation. Error codes unchecked.
cudaMemcpyFromSymbol(&h_kfp2, kfp2, sizeof(void *), 0, cudaMemcpyDeviceToHost);
someFunction<<<1,1>>>(h_kfp2, thrust::make_tuple(0.5f, 1.5f));
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_35 -rdc=true -o t1162 t1162.cu -lcudadevrt
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t1162
========= CUDA-MEMCHECK
param 0 = 1
param 1 = 1.500000
========= ERROR SUMMARY: 0 errors
$
在功能方面(能够使用不同的参数包调度多个子内核)我没有看到任何能力上的差异,再次假设您的参数很好地打包在元组中。
答案 1(得分:0)
快速而肮脏的解决方案是强制转换函数指针:
#include <thrust/tuple.h>
#include <tuple>
// "Quick and dirty" workaround: accept a type-erased pointer void(*)()
// so the function-pointer parameter no longer participates in pack
// deduction, then cast it back to the variadic signature before
// launching it as a child kernel.
// NOTE(review): calling through a pointer cast to a different function
// type is undefined behavior in C++ — this relies on the ABI, not the
// language.
template<typename... Args>
__global__ void someFunction(void (*fp)(), thrust::tuple<Args...> params) {
void (*kfp)(Args...) = (void (*)(Args...)) fp;
// Only the first tuple element is forwarded to the child kernel.
kfp<<<1,1>>>(thrust::get<0>(params));
}
// Child kernel invoked through the casted pointer; prints its argument.
__global__ void otherFunction(int n) {
printf("n = %d\n", n);
}
// Device-visible pointer to the typed kernel; the host reads it back into
// a TYPE-ERASED pointer void(*)() so the someFunction call no longer
// triggers the deduction failure.
// NOTE(review): as in the question's snippet, &otherFunction in host code
// is the launch stub address — verify on the target toolkit.
__constant__ void (*kfp)(int) = &otherFunction;
int testPassMain(int argc, char **argv) {
void (*h_kfp)();
cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
// Explicit <int> fixes Args...; the erased pointer matches void(*)().
someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
return 0;
}
我愿意接受更好的解决方案!