我在自己编写的模拟程序中遇到了一个奇怪的问题。我最近重构了代码,使其更整洁、更有条理。其中一项改动是把出问题的 CUDA 函数(基本上是复制粘贴)移到了另一个文件中。该函数使用 asinh、sinh 和 cosh 进行计算。我注意到,在移动之前,该函数的结果与手工计算值(Excel)一致;移动之后,双曲函数的输入相同,但结果明显不同(asinh 的偏差高达 10%,sinh 的偏差高达 0.5%)。这实际上破坏了我的模拟。我确信该函数的其余部分没有问题。
编辑:
经过进一步测试,我发现如果把出问题的角度(lambdaDegrees)硬编码为具体数值(即在 double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) }; 中直接写入该值),就会得到正确的预期结果。在执行该表达式之前和之后分别打印角度,角度值并未改变;但只要不对该值进行硬编码,就会得到错误的结果。最奇怪的是,仅仅再添加一条用于诊断的 printf 语句,就会使该函数产生另一个(同样错误的)结果。我怀疑这是否与我在 GPU 上设置回调函数的方式有关……也许是多个线程同时使用该函数,导致了某种(结果一致的)未定义行为?
对代码做了一些精简后,我重现了该错误。getSAtLambda 中(由 printf 语句输出的)x 的预期值是 1.268…,实际结果却是 1.768…。请告诉我你的看法。
main.cu
//standard library includes
#include <cstdio>
#include <cstdlib>
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
// Pointer type for the field callbacks. Arguments, in order:
// constants array, its length, a double position argument, simulation time, thread index.
typedef double(*callbackFcn)(double*, int, double, double, int);
// Device-side global variables. They are only declared here; the definitions
// live in otherfunctions.cu.
// NOTE(review): sharing extern __device__ symbols between .cu files normally
// requires relocatable device code (nvcc -rdc=true) - confirm the build flags.
extern __device__ double* fieldConstArray_GPU;
extern __device__ int arraySize_GPU;
extern __device__ callbackFcn callback_GPU;
// Field evaluation functions, callable from both host and device code.
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
// Kernel that stores the constants pointer and the callback in the device globals above.
__global__ void setupEnvironmentGPU(double* constArrayPtr);
// Kernel: every thread calls the device callback once with an example position.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds guard, so the launch
// must supply exactly the intended number of threads.
// The callback's return value is discarded.
__global__ void execute()
{
    // blockIdx / blockDim / threadIdx are unsigned. Brace-initialising an int
    // from them is a narrowing conversion, so the conversion is made explicit.
    const int thdInd{ static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x) };

    // Example values for the callback's third argument: thread 31487 gets a
    // fixed probe value, every other thread a value derived from its index.
    const double s{ (thdInd == 31487) ? 1233005.097 : ((115200 - thdInd) / 50000.0 * 6.371e6) };

    callback_GPU(fieldConstArray_GPU, arraySize_GPU, s, 0.0, thdInd);
}
// Prints a message naming `what` and aborts if a CUDA runtime call failed.
static void abortOnCudaFailure(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        std::fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        std::abort();
    }
}

// Copies the field constants to the GPU and runs setupEnvironmentGPU so the
// device globals point at them.
//
// The device buffer is deliberately not freed here: the device code keeps
// using the pointer after this function returns.
void setupEnvironment()
{
    // consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    //
    // All seven entries must be uploaded: the device code is told the array has
    // 7 elements and gradBAtS reads consts[5]. Uploading only the first five
    // made that an out-of-bounds device read (undefined behaviour).
    // NOTE(review): the ds (6.371) and errorTolerance (1.0e-4) values are
    // placeholders chosen for this example - confirm the intended values.
    double fieldConstArray_h[]{ 3.12e-5, 72.0, 66717978.17, 10.47213595, 85670894.1, 6.371, 1.0e-4 };
    static_assert(sizeof(fieldConstArray_h) / sizeof(fieldConstArray_h[0]) == 7,
        "device code expects 7 field constants");

    double* fieldConstants_d{ nullptr };
    abortOnCudaFailure(cudaMalloc((void **)&fieldConstants_d, sizeof(fieldConstArray_h)), "cudaMalloc");
    abortOnCudaFailure(cudaMemcpy(fieldConstants_d, fieldConstArray_h, sizeof(fieldConstArray_h), cudaMemcpyHostToDevice), "cudaMemcpy");

    setupEnvironmentGPU <<< 1, 1 >>> (fieldConstants_d);
    abortOnCudaFailure(cudaGetLastError(), "setupEnvironmentGPU launch");
    // Surface any execution error from the setup kernel before the caller goes on.
    abortOnCudaFailure(cudaDeviceSynchronize(), "setupEnvironmentGPU execution");
}
// Program entry point: sets up the device environment, then launches the
// execute kernel three times, waiting for each launch to finish.
// Returns 0 on success and 1 if a kernel launch or its execution fails.
int main()
{
    setupEnvironment();

    const int threadsPerBlock{ 256 };
    const int blockCount{ 115200 / threadsPerBlock }; // 115200 threads in total, divides evenly

    for (int loops{ 0 }; loops < 3; loops++)
    {
        execute <<< blockCount, threadsPerBlock >>> ();

        // A launch reports configuration errors only through cudaGetLastError;
        // faults inside the kernel show up at the next synchronising call.
        cudaError_t status{ cudaGetLastError() };
        if (status == cudaSuccess)
            status = cudaDeviceSynchronize();

        if (status != cudaSuccess)
        {
            std::fprintf(stderr, "execute kernel failed on loop %d: %s\n", loops, cudaGetErrorString(status));
            return 1;
        }
    }
    return 0;
}
otherfunctions.cu
#include <cmath>
#include <iostream>
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"
// Pointer type for the field callbacks. Arguments, in order:
// constants array, its length, a double position argument, simulation time, thread index.
typedef double(*callbackFcn)(double*, int, double, double, int);
// Device-side globals. All three are assigned by the setupEnvironmentGPU kernel;
// until it has run, both pointers are null.
__device__ double* fieldConstArray_GPU{ nullptr };
__device__ int arraySize_GPU{ 7 };
__device__ callbackFcn callback_GPU{ nullptr };
// Evaluates (0.5 * consts[2] / sqrt(3)) * (x + sinh(x) * cosh(x)),
// where x = asinh(sqrt(3) * sin(lambda)) and lambda is lambdaDegrees in radians.
//
// consts        - constants array; only consts[2] is read here
// arrayLength   - not used by this function
// lambdaDegrees - angle in degrees
// simtime, thdInd - only select the diagnostic print: it fires when
//                   simtime == 0.0 and thdInd == 31487
__host__ __device__ double getSAtLambda(double* consts, int arrayLength, double lambdaDegrees, double simtime, int thdInd)
{
    const double root3{ sqrt(3.0) };
    const double lambdaRad{ lambdaDegrees * 3.1415927 / 180.0 };
    const double x{ asinh(root3 * sin(lambdaRad)) };

    const bool isProbeThread{ simtime == 0.0 && thdInd == 31487 };
    if (isProbeThread)
        printf("\n\ngetSAtLambda: %f, %f\n\n", lambdaDegrees, x);

    const double scale{ 0.5 * consts[2] / root3 };
    return scale * (x + sinh(x) * cosh(x));
}
// Numerically inverts getSAtLambda: searches for the angle (in degrees) whose
// distance consts[4] - getSAtLambda(angle) matches s.
//
// consts      - [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ];
//               this function reads consts[1] and consts[4] itself
// arrayLength - forwarded to getSAtLambda, not checked here
// s           - target distance
// simtime, thdInd - forwarded to getSAtLambda for its diagnostic print
//
// The search stops once the relative error |(s_tmp - s) / s| is at most 1e-4,
// or once the step has shrunk below 1e-6 degrees. The angle is stepped until
// s_tmp crosses s, then the step is divided by 5 and the search repeats.
// NOTE(review): the inner search has no iteration cap, so it does not
// terminate if s_tmp becomes NaN.
__host__ __device__ double getLambdaAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
    // First guess: straight-line interpolation, -ILAT / s_max * s + ILAT
    double lambda_tmp{ (-consts[1] / consts[4]) * s + consts[1] };
    double s_tmp{ consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, thdInd) };
    double dlambda{ 1.0 };

    // fabs rather than unqualified abs: on the host path abs can resolve to the
    // integer overload, truncating the relative error to 0 and ending the
    // refinement before it starts.
    while (fabs((s_tmp - s) / s) > 1e-4) //errorTolerance
    {
        while (1)
        {
            const bool over{ s_tmp >= s };
            if (over)
            {
                // The refinement calls pass thread index 0, which keeps
                // getSAtLambda's diagnostic print (thdInd 31487 only) silent.
                lambda_tmp += dlambda;
                s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
                if (s_tmp < s)
                    break;
            }
            else
            {
                lambda_tmp -= dlambda;
                s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
                if (s_tmp >= s)
                    break;
            }
        }

        if (dlambda < 1e-4 / 100.0) //errorTolerance
            break;

        dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
    }

    return lambda_tmp;
}
// Evaluates the field value at distance s.
//
// The angle lambda for s comes from getLambdaAtS (degrees). The result is
//   -consts[0] / rnorm^3 * sqrt(1 + 3 * sin(lambda)^2),
// with rnorm = consts[3] * cos(lambda)^2.
// NOTE(review): this has the form of a dipole field magnitude - confirm.
//
// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ];
// this function reads consts[0] and consts[3] itself.
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
    const double latitudeDeg{ getLambdaAtS(consts, arrayLength, s, simtime, thdInd) };
    const double latitudeRad{ latitudeDeg * 3.1415927 / 180.0 };

    const double rnorm{ consts[3] * pow(cos(latitudeRad), 2) };
    const double radialTerm{ -consts[0] / pow(rnorm, 3) };
    const double angularTerm{ sqrt(1.0 + 3 * pow(sin(latitudeRad), 2)) };

    return radialTerm * angularTerm;
}
// Central-difference estimate of the derivative of BFieldAtS with respect to s:
//   (B(s + ds) - B(s - ds)) / (2 * ds), with ds = consts[5].
//
// consts must therefore hold at least 6 entries; arrayLength is forwarded to
// BFieldAtS but is not checked here.
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
    const double ds{ consts[5] };
    const double sAhead{ s + ds };
    const double sBehind{ s - ds };

    // Both evaluations stay in a single expression, as in the original.
    return (BFieldAtS(consts, arrayLength, sAhead, simtime, thdInd) - BFieldAtS(consts, arrayLength, sBehind, simtime, thdInd)) / (2 * ds);
}
// Kernel that initialises the device-side globals: stores the constants
// pointer, sets the array size to 7, and makes gradBAtS the callback.
// Every thread that runs it writes the same values.
__global__ void setupEnvironmentGPU(double* constArrayPtr)
{
    fieldConstArray_GPU = constArrayPtr;
    arraySize_GPU = 7;
    // Taken inside device code, so this is the device address of gradBAtS.
    callback_GPU = gradBAtS;
}
答案 0 :(得分:0)
我的发现摘要:
在 CUDA 8.0 上:
上面的代码能产生正确的结果:
在以下情况下,asinh(x) 会产生错误的结果:
更新到 CUDA 9.1 后,该问题得以解决。