Question

我写的模拟程序遇到了一个奇怪的问题。我最近重构了代码，让结构更清晰、更有条理。基本上（除其他改动外），我把出问题的CUDA函数移动（基本上是复制粘贴）到了另一个文件中。这个函数使用asinh以及sinh和cosh来进行计算。我注意到，移动之前，该函数产生的结果与手工计算（Excel）的预期值一致；移动之后，给双曲函数传入相同的输入，得到的结果却明显不同（asinh的偏差高达10％，sinh的偏差高达0.5％）。这实际上破坏了我的模拟。我确信该函数的其余部分没有问题。

编辑：经过进一步测试，我发现如果把有问题的角度（lambdaDegrees）硬编码到计算式中——即 double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) }; ——就会产生（正确的）预期结果。在执行该等式之前和之后检查角度，其值没有变化；但只要不对该值进行硬编码，就会产生错误的结果。最奇怪的是，仅仅多加一条用于诊断的printf语句，就会让该函数产生另一个（同样错误的）结果。我想知道这是否与我在GPU上设置回调函数的方式有关……也许是多个线程同时使用该函数，导致了某种（可稳定复现的）未定义行为？

对代码做了一些调整后，我复现了这个错误。getSAtLambda中（由printf语句输出的）x的预期值是1.268……，而实际结果是1.768……。请告诉我你的想法。

main.cu

//standard library includes
#include <cstdio>
#include <cstdlib>

//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"

// Pointer type for the field callbacks. The argument order matches the prototypes below:
// (consts, arrayLength, s, simtime, thdInd) -> double.
typedef double(*callbackFcn)(double*, int, double, double, int);

// Device-side globals. They are defined in otherfunctions.cu and assigned by setupEnvironmentGPU.
// NOTE(review): referencing __device__ variables defined in another .cu file normally requires
// relocatable device code (nvcc -rdc=true) - confirm the build enables it.
extern __device__ double*     fieldConstArray_GPU;
extern __device__ int         arraySize_GPU;
extern __device__ callbackFcn callback_GPU;

// Field model entry points implemented in otherfunctions.cu; both are callable from host and device.
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
// Kernel that stores the constants pointer, the array size and the callback in the device globals above.
__global__ void setupEnvironmentGPU(double* constArrayPtr);

// Kernel: each thread invokes the device callback once with an example position s.
// Launched from main() as 115200 / 256 blocks of 256 threads; the callback's return value is discarded.
__global__ void execute()
{
    const int thdInd = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);

    // Example positions: one fixed probe value for thread 31487, a linear ramp for every other thread.
    double s;
    if (thdInd == 31487)
    {
        s = 1233005.097;
    }
    else
    {
        s = (115200 - thdInd) / 50000.0 * 6.371e6;
    }

    callback_GPU(fieldConstArray_GPU, arraySize_GPU, s, 0.0, thdInd);
}

// Uploads the field constants to the GPU and runs setupEnvironmentGPU so the device-side
// globals (constants pointer, array size, callback) are ready before execute() is launched.
// Any CUDA failure is reported on stderr and terminates the process.
void setupEnvironment()
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    // The device code reads consts[5] (ds) in gradBAtS and arraySize_GPU is set to 7, so all
    // seven slots have to exist on the device; uploading only five leaves gradBAtS reading
    // past the end of the allocation.
    // NOTE(review): ds (6371.0) and errorTolerance (1.0e-4, the tolerance hard-coded in
    // getLambdaAtS) are placeholder values - confirm them against the intended simulation setup.
    const double fieldConstArray_h[]{ 3.12e-5, 72.0, 66717978.17, 10.47213595, 85670894.1, 6371.0, 1.0e-4 };
    const size_t constBytes{ sizeof(fieldConstArray_h) };
    double* fieldConstants_d{ nullptr };

    // Kernel launches and API calls fail silently unless their status is inspected.
    auto check = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            std::fprintf(stderr, "setupEnvironment: %s failed: %s\n", what, cudaGetErrorString(status));
            std::exit(EXIT_FAILURE);
        }
    };

    // The buffer is intentionally not freed here: the device keeps using it through fieldConstArray_GPU.
    check(cudaMalloc((void **)&fieldConstants_d, constBytes), "cudaMalloc");
    check(cudaMemcpy(fieldConstants_d, fieldConstArray_h, constBytes, cudaMemcpyHostToDevice), "cudaMemcpy");

    setupEnvironmentGPU <<< 1, 1 >>> (fieldConstants_d);
    check(cudaGetLastError(), "setupEnvironmentGPU launch");         // bad launch configuration
    check(cudaDeviceSynchronize(), "setupEnvironmentGPU execution"); // faults raised while the kernel ran
}

// Program entry: prepares the device environment, then launches execute() three times,
// waiting for each launch to finish. Returns 0 on success and 1 if a launch or its execution fails.
int main()
{
    setupEnvironment();

    for (int loops = 0; loops < 3; loops++)
    {
        execute <<< 115200 / 256, 256 >>> ();

        // A launch reports configuration errors through cudaGetLastError() and in-kernel
        // faults through the next synchronizing call; check both instead of ignoring them.
        cudaError_t status{ cudaGetLastError() };
        if (status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess)
        {
            std::fprintf(stderr, "execute (pass %d) failed: %s\n", loops, cudaGetErrorString(status));
            return 1;
        }
    }

    return 0;
}

otherfunctions.cu

#include <cmath>
#include <iostream>

//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"

// Pointer type for the field callbacks; gradBAtS below has this signature.
typedef double(*callbackFcn)(double*, int, double, double, int);

// Device-side globals, (re)assigned by setupEnvironmentGPU.
__device__ double*     fieldConstArray_GPU{ nullptr }; // constants table handed to the callback
// NOTE(review): the functions below index consts[0..5] and document seven entries - verify that
// the host really uploads that many values.
__device__ int         arraySize_GPU{ 7 };
__device__ callbackFcn callback_GPU{ nullptr };        // nullptr until setupEnvironmentGPU has run

// Evaluates s as a function of the angle lambdaDegrees (given in degrees), scaled by consts[2]
// (labelled L in the consts table). arrayLength is not used by this function.
__host__ __device__ double getSAtLambda(double* consts, int arrayLength, double lambdaDegrees, double simtime, int thdInd)
{
    const double sinLambda{ sin(lambdaDegrees * 3.1415927 / 180.0) };
    const double x{ asinh(sqrt(3.0) * sinLambda) };

    // Diagnostic trace: only the probe thread (31487) prints, and only at simulation time zero.
    const bool traceThisCall{ thdInd == 31487 && simtime == 0.0 };
    if (traceThisCall)
    {
        printf("\n\ngetSAtLambda: %f, %f\n\n", lambdaDegrees, x);
    }

    const double scale{ 0.5 * consts[2] / sqrt(3.0) };
    return scale * (x + sinh(x) * cosh(x));
}

// Numerically inverts getSAtLambda: finds the angle lambda (degrees) at which
// consts[4] - getSAtLambda(lambda) matches the requested s.
//
// Starts from a linear guess, then repeatedly steps lambda by dlambda until s_tmp crosses s,
// shrinking dlambda by a factor of 5 after every crossing. Stops once the relative error is
// at most 1e-4 or dlambda has dropped below 1e-6.
// NOTE: the inner loop only exits when s_tmp crosses s, so an s outside the range that
// s_tmp can reach never terminates.
__host__ __device__ double getLambdaAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    double lambda_tmp{ (-consts[1] / consts[4]) * s + consts[1] }; //-ILAT / s_max * s + ILAT
    double s_tmp{ consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, thdInd) };
    double dlambda{ 1.0 };

    // fabs rather than unqualified abs: in host code abs can resolve to the integer overload,
    // which would truncate the relative error to 0 and end the search immediately.
    while (fabs((s_tmp - s) / s) > 1e-4) //errorTolerance
    {
        while (true)
        {
            const bool over{ s_tmp >= s };
            if (over)
            {
                lambda_tmp += dlambda;
                // thdInd is passed as 0 here so getSAtLambda's diagnostic print fires only for the initial guess
                s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
                if (s_tmp < s)
                    break;
            }
            else
            {
                lambda_tmp -= dlambda;
                s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
                if (s_tmp >= s)
                    break;
            }
        }
        if (dlambda < 1e-4 / 100.0) //errorTolerance
            break;
        dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
    }

    return lambda_tmp;
}

// Field value at position s: resolves the angle for s with getLambdaAtS, then evaluates
// -consts[0] / rnorm^3 * sqrt(1 + 3 sin^2(lambda)) with rnorm = consts[3] * cos^2(lambda).
__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    const double latDeg{ getLambdaAtS(consts, arrayLength, s, simtime, thdInd) };
    const double latRad{ latDeg * 3.1415927 / 180.0 };

    const double cosSquared{ pow(cos(latRad), 2) };
    const double sinSquared{ pow(sin(latRad), 2) };

    const double rnorm{ consts[3] * cosSquared };
    const double rnormCubed{ pow(rnorm, 3) };

    return -consts[0] / rnormCubed * sqrt(1.0 + 3 * sinSquared);
}

// Central-difference estimate of dB/ds at s, using the step stored in consts[5]
// (labelled ds in the consts table).
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
    const double step{ consts[5] };
    const double fieldAhead{ BFieldAtS(consts, arrayLength, s + step, simtime, thdInd) };
    const double fieldBehind{ BFieldAtS(consts, arrayLength, s - step, simtime, thdInd) };

    return (fieldAhead - fieldBehind) / (2 * step);
}

// Kernel that initialises the device-side globals: stores the constants pointer and its
// element count, and points callback_GPU at gradBAtS. Every launched thread writes the same
// values; the visible caller launches it as <<< 1, 1 >>>.
__global__ void setupEnvironmentGPU(double* constArrayPtr)
{
    fieldConstArray_GPU = constArrayPtr;
    arraySize_GPU = 7;

    // The function's address is taken in device code, so the stored pointer is callable from kernels.
    callback_GPU = gradBAtS;
}

Answer 1

我的发现摘要：

在CUDA 8.0上：

上面的代码在以下情况下产生正确的结果：

- 编译为debug而不是release时（使用-O1的情况除外）
- 使用asinh的恒等式而不是实际的asinh函数时
- asinh的参数被硬编码时
- 在release和debug下都使用-O1而不是-O2时
- （矛盾的是）直接调用函数getSAtLambda而不是通过函数指针调用时

在以下情况下，asinh(x)产生错误的结果：

- 通过函数指针调用时

更新到CUDA 9.1修复了此问题。

CUDA（从C++调用）双曲函数在不同位置计算出不同的结果

1 个答案: