我正在尝试把一个CUDA内核拆分成四个独立的内核,以此对它进行优化。我在代码的开头为所有这些内核编写了函数原型:
// Forward declarations for the four flux kernels and the device-side driver.
// The kernel names must match the definitions below exactly (a single
// underscore after "flux"); with a different spelling the prototypes would
// declare unrelated, never-defined kernels and calcFlux would be calling
// undeclared functions.
__global__ void knowles_flux_oligomers(double* fluxes, double* conc);
__global__ void knowles_flux_nucleus(double* fluxes, double* conc);
__global__ void knowles_flux_fibrils(double* fluxes, double* conc);
__global__ void knowles_flux_maxlength(double* fluxes, double* conc);
__device__ void calcFlux(double* concs, double* fluxes, double* dt);
... Code ...
// Device-side driver that fills `fluxes` from `concs` by launching the four
// flux kernels one after another.
//   concs  - passed to every kernel as its `conc` argument
//   fluxes - passed to every kernel as its `fluxes` argument
//   dt     - not referenced anywhere in this function body
// numBlocks, numThreads, nc and maxlength are not declared in this function;
// presumably they are defined elsewhere in the file.
// NOTE(review): launching kernels from a __device__ function relies on CUDA
// dynamic parallelism, and device-side cudaDeviceSynchronize() is deprecated
// in newer toolkits - confirm against the toolkit version in use.
__device__ void calcFlux(double* concs, double* fluxes, double* dt)
{
// General pass: one thread per index over the numBlocks x numThreads grid.
knowles_flux_fibrils<<< numBlocks, numThreads >>>(fluxes, concs);
// Wait for the general pass before the special-case kernels overwrite
// individual entries of `fluxes`.
cudaDeviceSynchronize();
// One block of nc-1 threads: zeroes fluxes[1] .. fluxes[nc-1].
knowles_flux_oligomers<<< 1, nc-1 >>>(fluxes, concs);
// Single thread: rewrites fluxes[nc-1].
knowles_flux_nucleus<<< 1, 1 >>>(fluxes, concs);
// Single thread: rewrites fluxes[maxlength-1].
knowles_flux_maxlength<<< 1, 1 >>>(fluxes, concs);
// Wait for the three special-case kernels before returning.
cudaDeviceSynchronize();
}
// Zeroes one entry of `fluxes` per thread. The entry is the thread's global
// index shifted up by one, so fluxes[0] is never written by this kernel.
// `conc` is not read here.
__global__ void knowles_flux_oligomers(double *fluxes, double *conc)
{
    const int slot = blockIdx.x * blockDim.x + threadIdx.x + 1;
    fluxes[slot] = 0.0;
}
// Computes the flux for the entry at (global thread index + nc - 1).
// The result combines three terms: kn * conc[0]^nc, plus 2*km times the sum of
// conc over all higher indices below maxlength, minus 2*ka*conc[site]*conc[0].
// kn, km, ka, nc and maxlength are not declared in this kernel.
__global__ void knowles_flux_nucleus(double *fluxes, double *conc)
{
    const int site = blockIdx.x * blockDim.x + threadIdx.x + nc - 1;

    // Accumulate conc over every index strictly above `site`.
    double tail_sum = 0;
    int j = site + 1;
    while (j < (maxlength))
    {
        tail_sum += conc[j];
        ++j;
    }

    const double c0 = conc[0];
    const double here = conc[site];

    fluxes[site] = (kn)*pow(c0,(nc)) + 2*(km)*tail_sum - 2*(ka)*here*c0;
}
// General flux kernel: each thread handles the entry at its global index.
//   fluxes - output array; entry idx is written
//   conc   - input array; entries idx-1, idx, 0 and idx+1 .. maxlength-1 are read
// The value written is
//   -km*idx*conc[idx] + 2*km*sum(conc[idx+1 .. maxlength-1])
//   - 2*ka*conc[idx]*conc[0] + 2*ka*conc[idx-1]*conc[0].
// km, ka and maxlength are not declared in this kernel.
// Launch layout: 1D grid of 1D blocks; any grid size is accepted because
// threads past the end of the arrays exit early.
__global__ void knowles_flux_fibrils(double *fluxes, double *conc)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid need not divide the data evenly: threads whose index falls
    // outside [0, maxlength) must not touch either array.
    // assumes both arrays hold maxlength entries - TODO confirm
    if (idx >= (maxlength))
    {
        return;
    }

    double frag_term = 0;
    for (int s = idx + 1; s < (maxlength); s++)
    {
        frag_term += conc[s];
    }

    // Index 0 has no predecessor; reading conc[idx-1] there would be an
    // out-of-bounds access to conc[-1]. Use a zero contribution instead.
    // NOTE(review): confirm that zero is the intended value of this term for
    // idx == 0.
    const double prev = (idx > 0) ? conc[idx - 1] : 0.0;

    fluxes[idx] = -(km)*(idx)*conc[idx] + 2*(km)*frag_term - 2*(ka)*conc[idx]*conc[0] + 2*(ka)*prev*conc[0];
}
// Computes the flux for the entry at (global thread index + maxlength - 1).
// The value combines a term proportional to last*conc[last] with a term
// proportional to conc[last-1]*conc[0]; there is no sum over higher indices.
// km, ka and maxlength are not declared in this kernel.
__global__ void knowles_flux_maxlength(double *fluxes, double *conc)
{
    const int last = blockIdx.x * blockDim.x + threadIdx.x + maxlength - 1;

    const double here = conc[last];
    const double below = conc[last-1];
    const double c0 = conc[0];

    fluxes[last] = -km*(last)*here+2*(ka)*below*c0;
}
编译时出现错误“fatbinary : fatal error : 'Multiple Flux Kernel).sm_35.cubin' is not in 'keyword=value' format”,其中“Multiple Flux Kernel)”部分是我尝试编译的源文件的文件名末尾。
也许是我的搜索能力(google-fu)太弱,但我没有找到任何关于这种错误的资料。
答案 0 :(得分:1)
好的,问题与代码无关。该错误实际上与我试图编译的源文件的文件名有关。文件名最初是“GPU RKF45 (Variable Step Size, Multiple Flux Kernel).cu”(可变步长,多通量内核)。文件名中的逗号似乎是导致问题的原因。去掉逗号之后就可以正常编译了。