Question

在某些应用中，需要多个角度的正弦和余弦，其中各角度是通过将大小相等的增量 incr 重复加到起始值 base 上得到的。出于性能考虑，与其对每个生成的角度都调用标准数学库函数 sin()、cos()（或可能存在的非标准函数 sincos()），不如只计算一次 sin（base）和 cos（base），然后应用和角公式（angle-sum formulas）导出所有其余的正弦和余弦：

sin（base + incr）= cos（incr）·sin（base）+ sin（incr）·cos（base）
cos（base + incr）= cos（incr）·cos（base）- sin（incr）·sin（base）

这仅需要对比例因子 sin（incr）和 cos（incr）进行一次预计算，无论执行多少次迭代。

此方法存在两个问题。如果增量很小，则 cos（incr）将是一个接近 1 的数，在有限精度浮点格式下计算时，会因隐式的相减抵消（subtractive cancellation）而损失精度。此外，由于计算没有按数值上更有利的形式 sin（base + incr）= sin（base）+ adjust 来安排（其中计算出的量 adjust 的幅度明显小于 sin（base），余弦的情形类似），因此会引入不必要的舍入误差。

由于通常需要执行数十到数百个迭代步骤，这些误差会累积。应如何组织迭代计算，才最有利于保持高精度？如果融合乘加（fused multiply-add，FMA）运算可以通过标准数学函数 fma() 和 fmaf() 使用，算法应做哪些改动？

Answer 1

对正弦应用半角公式（half-angle formula）可以解决问题中提到的两个影响精度的问题：

sin（incr / 2）=√（（1-cos（incr））/ 2）⇒
  sin²（incr / 2）=（1-cos（incr））/ 2⇔
   2·sin²（incr / 2）= 1-cos（incr）⇔
   1-2·sin²（incr / 2）= cos（incr）

将其替换为原始公式将得到以下中间表示形式：

sin（base + incr）=（1-2·sin²（incr / 2））·sin（base）+ sin（incr）·cos（base）
cos（base + incr）=（1-2·sin²（incr / 2））·cos（base）- sin（incr）·sin（base）

只需对各项进行简单的重新排列，就可以得到所需形式的公式：

sin（base + incr）= sin（base）+（sin（incr）·cos（base）-2·sin²（incr / 2）·sin（base））
cos（base + incr）= cos（base）-（2·sin²（incr / 2）·cos（base）+ sin（incr）·sin（base））

与原始公式一样，这仅需一次性计算两个比例因子，即 2·sin²（incr / 2）和 sin（incr） 。对于较小的增量，两者都很小：保留了完整的精度。

关于如何将 FMA 应用于此计算，有两种选择。一种是放弃使用单个调整量的做法，改为使用两个调整量，从而使运算次数最少，并寄希望于 FMA 运算减少的舍入误差（两次运算只舍入一次）能够弥补由此带来的精度损失：

sin（base + incr）= fma（-2·sin²（incr / 2），sin（base），fma（sin（incr），cos（base），sin（base）））
cos（base + incr）= fma（-2·sin²（incr / 2），cos（base），fma（-sin（incr），sin（base），cos（base）））

另一种选择是在改进后的公式中只使用一个 FMA，不过尚不清楚两个乘法中的哪一个应当映射到 FMA 内部那个不经舍入的乘法上：

sin（base + incr）= sin（base）+ fma（sin（incr），cos（base），-2·sin²（incr / 2）·sin（base））
cos（base + incr）= cos（base）- fma（sin（incr），sin（base），2·sin²（incr / 2）·cos（base））

下面的测试框架通过生成大量（base，incr）对来评估上面讨论的每种计算方案，并对其中每一对迭代固定数目的步骤，同时收集所有生成的正弦和余弦值的误差。据此，它为每个测试用例分别计算正弦和余弦的均方根（root-mean-square，RMS）误差，最后报告在所有生成的测试用例中观察到的最大 RMS 误差。

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Identifiers for the rotation-step variants implemented in main().
// MODE selects, at compile time, which variant is built and tested.
#define NAIVE    (1)  // angle-sum formulas with scale factors sin(incr), cos(incr)
#define ROBUST   (2)  // sin(base)+adjust form with 2*sin^2(incr/2), plain multiplies/adds
#define FAST     (3)  // two nested fmaf() calls per output
#define ACCURATE (4)  // sin(base)+adjust form with one fmaf() per output
#define MODE (ACCURATE)

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
// State of the KISS pseudo-random generator. Every expansion of KISS updates
// all four variables, so each use of KISS yields the next value in sequence.
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
// z update: 36969 times the low 16 bits of z, plus z shifted right by 16
#define znew (z=36969*(z&0xffff)+(z>>16))
// w update: same form as znew, with multiplier 18000
#define wnew (w=18000*(w&0xffff)+(w>>16))
// combination of both updates: updated z shifted left by 16, plus updated w
#define MWC  ((znew<<16)+wnew)
// three xor-shift steps on jsr (left 13, right 17, left 5)
// NOTE(review): the trailing 2^32-1 / 2^32 annotations are presumably the
// periods of SHR3 and CONG for a 32-bit unsigned int -- confirm
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
// linear congruential update of jcong (multiplier 69069, increment 13579)
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
// one generator output: (MWC xor CONG) plus SHR3
#define KISS ((MWC^CONG)+SHR3)

/*
 * Accuracy test for iterative sine/cosine generation at equidistant angles.
 *
 * For each of `count` pseudo-random (base, incr) pairs, starts from
 * sinf(base) and cosf(base) and applies N rotation steps using the variant
 * selected by MODE. After every step the relative error against the
 * double-precision sin()/cos() of base + i*incr is accumulated. The RMS
 * error of each test case is formed, and the largest RMS error seen over
 * all test cases is printed separately for sine and cosine.
 *
 * Returns EXIT_SUCCESS.
 */
int main (void)
{
    const int N = 100;          // # rotation steps per test case
    const int count = 2000000;  // # test cases (a pair of base and increment values)
    double maxrmsS = 0, maxrmsC = 0;

    /* The banner states whether the selected update step calls fmaf():
       NAIVE and ROBUST use plain multiplies/adds, FAST and ACCURATE use fmaf() */
#if MODE == NAIVE
    printf ("testing: NAIVE (without FMA)\n");
#elif MODE == ROBUST
    printf ("testing: ROBUST (without FMA)\n");
#elif MODE == FAST
    printf ("testing: FAST (with FMA)\n");
#elif MODE == ACCURATE
    printf ("testing: ACCURATE (with FMA)\n");
#else
#error unsupported MODE
#endif // MODE

    for (int tc = 0; tc < count; tc++) {
        /* generate test case; base is drawn before incr, one KISS value each */
        const float base = (float)(KISS * 1.21e-10);      // starting angle, < 30 degrees
        const float incr = (float)(KISS * 2.43e-10 / N);  // increment, < 60/N degrees

        /* set up rotation parameters (computed once per test case) */
        const float s1 = sinf (incr);
#if MODE == NAIVE
        const float c1 = cosf (incr);
#else
        const float sh = sinf (incr * 0.5f);
        const float c1 = 2.0f * sh * sh;   // 2*sin^2(incr/2), i.e. 1 - cos(incr)
#endif // MODE
        double sumerrsqS = 0;
        double sumerrsqC = 0;

        float s0 = sinf (base); // initial sine
        float c0 = cosf (base); // initial cosine

        /* run test case through N rotation steps */
        for (int i = 1; i <= N; i++) {
            const float tt = s0; // old sine, needed after s0 is overwritten
#if MODE == NAIVE
            /* least accurate, 6 FP ops */
            s0 = c1 * tt + s1 * c0; // new sine
            c0 = c1 * c0 - s1 * tt; // new cosine
#elif MODE == ROBUST
            /* very accurate, 8 FP ops */
            s0 = ( s1 * c0 - c1 * tt) + tt; // new sine
            c0 = (-s1 * tt - c1 * c0) + c0; // new cosine
#elif MODE == FAST
            /* accurate and fast, 4 FP ops */
            s0 = fmaf (-c1, tt, fmaf ( s1, c0, tt)); // new sine
            c0 = fmaf (-c1, c0, fmaf (-s1, tt, c0)); // new cosine
#elif MODE == ACCURATE
            /* most accurate, 6 FP ops */
            s0 = tt + fmaf (s1, c0, -c1 * tt); // new sine
            c0 = c0 - fmaf (s1, tt,  c1 * c0); // new cosine
#endif // MODE

            /* double-precision reference at angle base + i*incr */
            const double angle = fma ((double)i, (double)incr, (double)base);
            const double refS = sin (angle);
            const double refC = cos (angle);

            /* accumulate squared relative errors */
            const double errS = ((double)s0 - refS) / refS;
            const double errC = ((double)c0 - refC) / refC;
            sumerrsqS = fma (errS, errS, sumerrsqS);
            sumerrsqC = fma (errC, errC, sumerrsqC);
        }

        /* per-case RMS error; keep the worst case seen so far */
        const double rmsS = sqrt (sumerrsqS / N);
        const double rmsC = sqrt (sumerrsqC / N);
        if (rmsS > maxrmsS) maxrmsS = rmsS;
        if (rmsC > maxrmsC) maxrmsC = rmsC;
    }

    printf ("max rms error sin = % 16.9e\n", maxrmsS);
    printf ("max rms error cos = % 16.9e\n", maxrmsC);

    return EXIT_SUCCESS;
}

测试框架的输出表明，基于 FMA 的快速方案优于问题中的朴素方法，而基于 FMA 的更精确方案是所考虑的各方案中最准确的：

testing: NAIVE (without FMA)
max rms error sin =  4.837386842e-006
max rms error cos =  6.884047862e-006

testing: ROBUST (without FMA)
max rms error sin =  3.330292645e-006
max rms error cos =  4.297631502e-006

testing: FAST (with FMA)
max rms error sin =  3.532624939e-006
max rms error cos =  4.763623188e-006

testing: ACCURATE (with FMA)
max rms error sin =  3.330292645e-006
max rms error cos =  4.104813533e-006

Answer 2

如果您想在很长的迭代过程中最大化精度，可以在由前一个精确值递推生成增量结果的同时，逐步计算不含累积误差的精确值。

例如，如果您对 x = 6 … 31（比如说）预先计算 sin（incr · 2^x）和 cos（incr · 2^x），那么就可以在输出前面 64 个值的同时，使用和角公式逐次计算每个 incr 的 64·n 倍处的结果。

每隔 64 个值，您就舍弃递推生成的结果，代之以精确值，因此即使长时间运行也不会累积误差。

此外，由于仅需要来自任何精确基数的64个增量结果，因此您可以预先计算直接从基数而不是先前的结果中计算出这些结果所需的64个正弦和余弦。

Answer 3

可以通过以下方式重新排列sin（base + incr）和cos（base + incr）的方程：

sin（base + incr）= cos（incr）·sin（base）+ sin（incr）·cos（base）
    sin（base + incr）= sin（base）+（1-cos（incr））·-sin（base）+ sin（incr）·cos（base）
    sin（base + incr）= sin（base）+ sin（incr）·（-1 / sin（incr）·（1-cos（incr））·sin（base）+ cos（base））
    sin（base + incr）= sin（base）+ sin（incr）·（-tan（incr / 2）·sin（base）+ cos（base））

cos（base + incr）= cos（incr）·cos（base）- sin（incr）·sin（base）
    cos（base + incr）= cos（base）-sin（incr）·（tan（incr / 2）·cos（base）+ sin（base））

这里我们使用了公式（1 - cos（x））/ sin（x）= tan（x / 2）（例如，可参见原文中的链接）。这种形式能否比其他方法给出更准确的结果并不显而易见，但实际上效果很好，我们将在后面看到。

同样，这只需要对两个比例因子 sin（incr）和 tan（incr / 2）进行一次预计算。在 C 语言中，可以用 4 个 FMA 来实现这些公式：

        /* Operands as set up in the full test code for this mode:
           s1 = sinf(incr), c1 = tanf(incr/2), tt = old sine (s0 before the update) */
        s0 = fmaf ( s1, fmaf (-tt, c1, c0), tt); // new sine
        c0 = fmaf (-s1, fmaf ( c0, c1, tt), c0); // new cosine

完整的更新测试代码位于此答案的结尾。使用gcc -O3 -Wall -m64 -march=skylake fastsincos.c -lm（GCC版本7.3），结果是：

testing: FAST (with FMA)
max rms error sin =  3.532624939e-06
max rms error cos =  4.763623188e-06

testing: ACCURATE (with FMA)
max rms error sin =  3.330292645e-06
max rms error cos =  4.104813533e-06

testing: FAST_ACC (with FMA)
max rms error sin =  3.330292645e-06
max rms error cos =  3.775300478e-06

在此测试中，新解决方案FAST_ACC确实比其他解决方案准确一些。

修改后的测试代码：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Identifiers for the rotation-step variants implemented in main().
// MODE selects, at compile time, which variant is built and tested.
#define NAIVE    (1)  // angle-sum formulas with scale factors sin(incr), cos(incr)
#define ROBUST   (2)  // sin(base)+adjust form with 2*sin^2(incr/2), plain multiplies/adds
#define FAST     (3)  // two nested fmaf() calls per output
#define ACCURATE (4)  // sin(base)+adjust form with one fmaf() per output
#define FAST_ACC (5)  // two nested fmaf() calls per output, scale factor tan(incr/2)
#define MODE (FAST_ACC)

// Fixes via: Greg Rose, KISS: A Bit Too Simple. http://eprint.iacr.org/2011/007
// State of the KISS pseudo-random generator. Every expansion of KISS updates
// all four variables, so each use of KISS yields the next value in sequence.
static unsigned int z=362436069,w=521288629,jsr=362436069,jcong=123456789;
// z update: 36969 times the low 16 bits of z, plus z shifted right by 16
#define znew (z=36969*(z&0xffff)+(z>>16))
// w update: same form as znew, with multiplier 18000
#define wnew (w=18000*(w&0xffff)+(w>>16))
// combination of both updates: updated z shifted left by 16, plus updated w
#define MWC  ((znew<<16)+wnew)
// three xor-shift steps on jsr (left 13, right 17, left 5)
// NOTE(review): the trailing 2^32-1 / 2^32 annotations are presumably the
// periods of SHR3 and CONG for a 32-bit unsigned int -- confirm
#define SHR3 (jsr^=(jsr<<13),jsr^=(jsr>>17),jsr^=(jsr<<5)) /* 2^32-1 */
// linear congruential update of jcong (multiplier 69069, increment 13579)
#define CONG (jcong=69069*jcong+13579)                     /* 2^32 */
// one generator output: (MWC xor CONG) plus SHR3
#define KISS ((MWC^CONG)+SHR3)

/*
 * Accuracy test for iterative sine/cosine generation at equidistant angles.
 *
 * For each of `count` pseudo-random (base, incr) pairs, starts from
 * sinf(base) and cosf(base) and applies N rotation steps using the variant
 * selected by MODE. After every step the relative error against the
 * double-precision sin()/cos() of base + i*incr is accumulated. The RMS
 * error of each test case is formed, and the largest RMS error seen over
 * all test cases is printed separately for sine and cosine.
 *
 * Returns EXIT_SUCCESS.
 */
int main (void)
{
    const int N = 100;          // # rotation steps per test case
    const int count = 2000000;  // # test cases (a pair of base and increment values)
    double maxrmsS = 0, maxrmsC = 0;

    /* The banner states whether the selected update step calls fmaf():
       NAIVE and ROBUST use plain multiplies/adds, the other modes use fmaf() */
#if MODE == NAIVE
    printf ("testing: NAIVE (without FMA)\n");
#elif MODE == ROBUST
    printf ("testing: ROBUST (without FMA)\n");
#elif MODE == FAST
    printf ("testing: FAST (with FMA)\n");
#elif MODE == ACCURATE
    printf ("testing: ACCURATE (with FMA)\n");
#elif MODE == FAST_ACC
    printf ("testing: FAST_ACC (with FMA)\n");
#else
#error unsupported MODE
#endif // MODE

    for (int tc = 0; tc < count; tc++) {
        /* generate test case; base is drawn before incr, one KISS value each */
        const float base = (float)(KISS * 1.21e-10);      // starting angle, < 30 degrees
        const float incr = (float)(KISS * 2.43e-10 / N);  // increment, < 60/N degrees

        /* set up rotation parameters (computed once per test case) */
        const float s1 = sinf (incr);
#if MODE == NAIVE
        const float c1 = cosf (incr);
#elif MODE == FAST_ACC
        const float c1 = tanf (incr * 0.5f);   // tan(incr/2)
#else
        const float sh = sinf (incr * 0.5f);
        const float c1 = 2.0f * sh * sh;       // 2*sin^2(incr/2), i.e. 1 - cos(incr)
#endif // MODE
        double sumerrsqS = 0;
        double sumerrsqC = 0;

        float s0 = sinf (base); // initial sine
        float c0 = cosf (base); // initial cosine

        /* run test case through N rotation steps */
        for (int i = 1; i <= N; i++) {
            const float tt = s0; // old sine, needed after s0 is overwritten
#if MODE == NAIVE
            /* least accurate, 6 FP ops */
            s0 = c1 * tt + s1 * c0; // new sine
            c0 = c1 * c0 - s1 * tt; // new cosine
#elif MODE == ROBUST
            /* very accurate, 8 FP ops */
            s0 = ( s1 * c0 - c1 * tt) + tt; // new sine
            c0 = (-s1 * tt - c1 * c0) + c0; // new cosine
#elif MODE == FAST
            /* accurate and fast, 4 FP ops */
            s0 = fmaf (-c1, tt, fmaf ( s1, c0, tt)); // new sine
            c0 = fmaf (-c1, c0, fmaf (-s1, tt, c0)); // new cosine
#elif MODE == ACCURATE
            /* most accurate, 6 FP ops */
            s0 = tt + fmaf (s1, c0, -c1 * tt); // new sine
            c0 = c0 - fmaf (s1, tt,  c1 * c0); // new cosine
#elif MODE == FAST_ACC
            /* fast and accurate, 4 FP ops */
            s0 = fmaf ( s1, fmaf (-tt, c1, c0), tt); // new sine
            c0 = fmaf (-s1, fmaf ( c0, c1, tt), c0); // new cosine
#endif // MODE

            /* double-precision reference at angle base + i*incr */
            const double angle = fma ((double)i, (double)incr, (double)base);
            const double refS = sin (angle);
            const double refC = cos (angle);

            /* accumulate squared relative errors */
            const double errS = ((double)s0 - refS) / refS;
            const double errC = ((double)c0 - refC) / refC;
            sumerrsqS = fma (errS, errS, sumerrsqS);
            sumerrsqC = fma (errC, errC, sumerrsqC);
        }

        /* per-case RMS error; keep the worst case seen so far */
        const double rmsS = sqrt (sumerrsqS / N);
        const double rmsC = sqrt (sumerrsqC / N);
        if (rmsS > maxrmsS) maxrmsS = rmsS;
        if (rmsC > maxrmsC) maxrmsC = rmsC;
    }

    printf ("max rms error sin = % 16.9e\n", maxrmsS);
    printf ("max rms error cos = % 16.9e\n", maxrmsC);

    return EXIT_SUCCESS;
}

对于等距角度，快速，准确地迭代生成正弦和余弦

3 个答案: