我正在尝试将包含函数指针数组的结构复制到设备中。我无法弄清楚以下代码有什么问题。内核中的代码不起作用。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
#define N_MODELS 2
#define N_PARAMS 2
struct userData
{
float (*eval[N_MODELS]) (const float params[N_PARAMS]);
};
__device__ float add(const float params[N_PARAMS])
{
return params[0] + params[1];
}
__device__ float mult(const float params[N_PARAMS])
{
return params[0] * params[1];
}
// function pointer for device
__device__ float (*add_ptr)(const float params[N_PARAMS]) = add;
__device__ float (*mult_ptr)(const float params[N_PARAMS]) = mult;
__global__ void kernel(float *d_result,struct userData *d_user, const float *d_params)
{
//this is currently not working
*d_result = (d_user->eval[0]) (d_params);
printf("d_result = %g\n", *d_result);
}
int main(void)
{
//*************//
// struct part //
//*************//
// function pointer
float(*fpAdd)(const float params[N_PARAMS]);
float(*fpMult)(const float params[N_PARAMS]);
// copy function pointers to device
gpuErrchk(cudaMemcpyFromSymbol(&fpAdd, add_ptr, sizeof(void *)));
gpuErrchk(cudaMemcpyFromSymbol(&fpMult, mult_ptr, sizeof(void *)));
struct userData h_user;
h_user.eval[0] = add;
h_user.eval[1] = mult;
struct userData *d_user;
gpuErrchk(cudaMalloc(&d_user, sizeof(userData)));
gpuErrchk(cudaMemcpy(d_user, &h_user, sizeof(userData), cudaMemcpyHostToDevice));
// parameters
float h_params[N_PARAMS] = { 3.0f, 2.0f };
float *d_params;
gpuErrchk(cudaMalloc(&d_params, N_PARAMS*sizeof(float)));
gpuErrchk(cudaMemcpy(d_params, h_params, N_PARAMS*sizeof(float), cudaMemcpyHostToDevice));
// result
float h_result = 1.0f;
float *d_result;
gpuErrchk(cudaMalloc(&d_result, sizeof(float)));
gpuErrchk(cudaMemcpy(d_result, &h_result, sizeof(float), cudaMemcpyHostToDevice));
kernel << <1, 1 >> >(d_result, d_user, d_params);
gpuErrchk(cudaMemcpy(&h_result, d_result, sizeof(float), cudaMemcpyDeviceToHost));
printf("result = %g\n", h_result);
gpuErrchk(cudaFree(d_result));
gpuErrchk(cudaFree(d_params));
gpuErrchk(cudaFree(d_user));
return EXIT_SUCCESS;
}
答案 0 :(得分:1)
错误在于:
struct userData h_user;
h_user.eval[0] = add;
h_user.eval[1] = mult;
您使用错误的值填充结构。读取__device__
内存值以从设备获取函数指针后,需要使用这些值来填充函数结构,而不是设备函数的主机符号。所以这个:
struct userData h_user;
h_user.eval[0] = fpAdd;
h_user.eval[1] = fpMult;
应按预期工作,