我想做以下事情:
#include <thrust/tuple.h>
#include <tuple>
// Host-side variadic template: takes a function pointer and its call
// arguments packed in a thrust::tuple. The call site below fails with
// "template argument deduction/substitution failed".
// NOTE(review): older Thrust implements tuple as a fixed-arity (10-slot)
// template padded with null_type, so thrust::tuple<Args...> may not match
// the type thrust::make_tuple() returns — likely the cause of the
// deduction failure; confirm against the Thrust version in use.
template<typename... Args>
void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}
// Target for the function-pointer parameter above: takes a single int
// and intentionally does nothing (minimal reproduction).
void otherFunction(int n) {
}
// Minimal reproduction driver: even with the explicit <int> template
// argument, the call below fails to compile with the error quoted in
// the comment.
int main(int argc, char **argv) {
//// template argument deduction/substitution failed ////
// NOTE(review): thrust::make_tuple(1) may not produce exactly
// thrust::tuple<int> (null_type padding in older Thrust), which would
// make it mismatch thrust::tuple<Args...> — confirm.
someFunction<int>(&otherFunction, thrust::make_tuple(1));
return 0;
}
我尝试过:在 someFunction 中改用带静态函数的 struct,那样可以正常工作。但在原始代码中 someFunction 是一个 CUDA 内核,所以我不能这样做。还有什么想法?
编辑:为了更清楚:someFunction 和 otherFunction 都是 __global__ 函数!
#include <thrust/tuple.h>
#include <tuple>
// Kernel version of the variadic template from the first snippet:
// receives a device function pointer plus its arguments in a
// thrust::tuple. Body left empty — this snippet only demonstrates the
// same deduction/substitution failure for a __global__ function.
template<typename... Args>
__global__ void someFunction(void (*fp)(Args...), thrust::tuple<Args...> params) {
}
// Kernel whose address is stored in the __constant__ pointer below and
// meant to be invoked through someFunction. Empty body (reproduction).
__global__ void otherFunction(int n) {
}
// Device-visible pointer to the kernel, set at static-initialization time.
// NOTE(review): in host code &otherFunction yields the host launch stub,
// not a device-callable address — verify this initializer does what the
// author intends on the target toolkit.
__constant__ void (*kfp)(int) = &otherFunction;
// Host driver: copy the stored pointer back to the host, then try to pass
// it into the templated kernel. The cudaMemcpyFromSymbol return value is
// deliberately unchecked (reproduction code, not production).
int testPassMain(int argc, char **argv) {
void (*h_kfp)(int);
cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
// Fails to compile: "template argument deduction/substitution failed".
someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
return 0;
}
我在两个示例中都遇到了编译器错误:template argument deduction/substitution failed。
答案 0(得分:2)
将函数指针及其参数作为 thrust::tuple 传递给全局函数
这样的事情应该是可行的:
$ cat t1161.cu
#include <thrust/tuple.h>
#include <stdio.h>
// Answer's "someFunction": a kernel that takes a device function pointer
// fp and a tuple-like params, calling fp once per element. Exactly two
// elements are assumed here (get<0> and get<1>); the tuple type T is a
// plain template parameter, sidestepping pack deduction entirely.
template <typename T, typename T1>
__global__ void kernel(void (*fp)(T1), T params){ // "someFunction"
fp(thrust::get<0>(params));
fp(thrust::get<1>(params));
}
// Answer's "otherFunction": device function called through the pointer;
// prints its single int argument (device printf — debugging output).
__device__ void df(int n){ // "otherFunction"
printf("parameter = %d\n", n);
}
// Device-resident copy of df's address; a __device__ variable can be
// statically initialized with a device function's address, and the host
// retrieves it via cudaMemcpyFromSymbol below (host code cannot take
// df's address directly).
__device__ void (*ddf)(int) = df;
int main(){
void (*hdf)(int);
thrust::tuple<int, int> my_tuple = thrust::make_tuple(1,2);
// Copy the device function pointer to the host so it can be passed as an
// ordinary kernel argument. Error codes unchecked (example code).
cudaMemcpyFromSymbol(&hdf, ddf, sizeof(void *));
kernel<<<1,1>>>(hdf, my_tuple);
// Block until the kernel finishes so its printf output is flushed.
cudaDeviceSynchronize();
}
$ nvcc -o t1161 t1161.cu
$ cuda-memcheck ./t1161
========= CUDA-MEMCHECK
parameter = 1
parameter = 2
========= ERROR SUMMARY: 0 errors
$
如果您希望 df 成为 __global__ 函数,类似的方法也应该可行,您只需要正确处理动态并行的情形。同样,只需略微改动就可以把元组直接传递给子函数(即 df,无论它是设备函数还是内核)。既然你的参数已经很好地打包在 thrust 元组中,我不清楚你为什么还需要可变参数模板。
$ cat t1162.cu
#include <thrust/tuple.h>
#include <stdio.h>
// Parent kernel: launches the child kernel fp with params via CUDA
// dynamic parallelism (hence the -rdc=true / sm_35 build flags in the
// transcript below this snippet).
template<typename T>
__global__ void someFunction(void (*fp)(T), T params) {
fp<<<1,1>>>(params);
// Device-side sync: wait for the child grid to finish before returning.
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in
// newer CUDA toolkits — confirm the toolkit version this targets.
cudaDeviceSynchronize();
}
// Child kernel taking a one-element int tuple; prints element 0.
__global__ void otherFunction(thrust::tuple<int> t) {
printf("param 0 = %d\n", thrust::get<0>(t));
}
// Child kernel taking a two-element float tuple; prints element 1 only.
__global__ void otherFunction2(thrust::tuple<float, float> t) {
printf("param 1 = %f\n", thrust::get<1>(t));
}
// Device-resident copies of the two child-kernel addresses; the host
// reads them back with cudaMemcpyFromSymbol so they can be passed as
// plain kernel arguments.
__device__ void (*kfp)(thrust::tuple<int>) = &otherFunction;
__device__ void (*kfp2)(thrust::tuple<float, float>) = &otherFunction2;
int main(int argc, char **argv) {
void (*h_kfp)(thrust::tuple<int>);
void (*h_kfp2)(thrust::tuple<float, float>);
// Dispatch #1: int tuple through the first child kernel.
cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
someFunction<<<1,1>>>(h_kfp, thrust::make_tuple(1));
cudaDeviceSynchronize();
// Dispatch #2: float tuple through the second child kernel — same parent
// kernel template, different instantiation. Error codes unchecked.
cudaMemcpyFromSymbol(&h_kfp2, kfp2, sizeof(void *), 0, cudaMemcpyDeviceToHost);
someFunction<<<1,1>>>(h_kfp2, thrust::make_tuple(0.5f, 1.5f));
cudaDeviceSynchronize();
return 0;
}
$ nvcc -arch=sm_35 -rdc=true -o t1162 t1162.cu -lcudadevrt
$ CUDA_VISIBLE_DEVICES="1" cuda-memcheck ./t1162
========= CUDA-MEMCHECK
param 0 = 1
param 1 = 1.500000
========= ERROR SUMMARY: 0 errors
$
在功能方面(能够使用不同的参数包调度多个子内核)我没有看到任何能力上的差异,再次假设您的参数很好地打包在元组中。
答案 1(得分:0)
快速而肮脏的解决方案是强制转换函数指针:
#include <thrust/tuple.h>
#include <tuple>
// "Quick and dirty" workaround: accept a type-erased pointer void(*)()
// so the function-pointer parameter no longer participates in pack
// deduction, then cast it back to the variadic signature before
// launching it as a child kernel.
// NOTE(review): calling through a pointer cast to a different function
// type is undefined behavior in C++ — this relies on the ABI, not the
// language.
template<typename... Args>
__global__ void someFunction(void (*fp)(), thrust::tuple<Args...> params) {
void (*kfp)(Args...) = (void (*)(Args...)) fp;
// Only the first tuple element is forwarded to the child kernel.
kfp<<<1,1>>>(thrust::get<0>(params));
}
// Child kernel invoked through the casted pointer; prints its argument.
__global__ void otherFunction(int n) {
printf("n = %d\n", n);
}
// Device-visible pointer to the typed kernel; the host reads it back into
// a TYPE-ERASED pointer void(*)() so the someFunction call no longer
// triggers the deduction failure.
// NOTE(review): as in the question's snippet, &otherFunction in host code
// is the launch stub address — verify on the target toolkit.
__constant__ void (*kfp)(int) = &otherFunction;
int testPassMain(int argc, char **argv) {
void (*h_kfp)();
cudaMemcpyFromSymbol(&h_kfp, kfp, sizeof(void *), 0, cudaMemcpyDeviceToHost);
// Explicit <int> fixes Args...; the erased pointer matches void(*)().
someFunction<int><<<1,1>>>(h_kfp, thrust::make_tuple(1));
return 0;
}
我愿意接受更好的解决方案!