Question

如果平台支持融合乘加（FMA），acosf() 的简单实现可以很容易地达到相对于无限精确（数学）结果 1.5 ulp 的误差界限。这意味着其结果与舍入到最近偶数（round-to-nearest-or-even）模式下正确舍入的结果相差不超过 1 ulp。

然而，这样的实现通常包括两个主要代码分支，其将主要近似间隔[0,1]大致分成两半，如下面的示例性代码中那样。在针对SIMD架构时，这种分支抑制了编译器的自动矢量化。

是否有另一种算法方法可以更容易地实现自动矢量化，同时保持1.5 ulps的相同误差范围？可以假设对FMA的平台支持。

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/*
 * Arccosine built on asinf_core(); stated maximum error = 1.45667 ulp.
 *
 * The argument is first mapped to -|a|. Two code paths then produce
 * arccos(|a|), and a final reflection handles arguments in [-1, 0].
 * All comparisons are written so that a NaN input falls through without
 * having its "sign" modified.
 */
float my_acosf (float a)
{
    /* pi is represented as the product pi_fac0 * pi_fac1, formed inside an FMA */
    const float pi_fac0      = 0x1.ddcb02p+0f;
    const float pi_fac1      = 0x1.aee9d6p+0f;
    const float half_pi_fac0 = 0x1.ddcb02p-1f; /* pi_fac0 / 2 */
    float neg_abs = a;
    float res;

    if (a > 0.0f) {
        neg_abs = -a;
    }

    if (neg_abs > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        res = fmaf (half_pi_fac0, pi_fac1, asinf_core (neg_abs));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        const float half_one_minus_x = fmaf (0.5f, neg_abs, 0.5f);
        res = 2.0f * asinf_core (sqrtf (half_one_minus_x));
    }

    /* arccos (-x) = pi - arccos(x), applied only for a in [-1, 0] */
    if (!(a > 0.0f) && (a >= -1.0f)) {
        res = fmaf (pi_fac0, pi_fac1, -res);
    }
    return res;
}

Answer 1

我得到的最接近令人满意的解决方案基于 Robert Harley 在一篇新闻组帖子（news posting）中提出的想法：他观察到，对于 [0,1] 中的 x，$\arccos(x) \approx \sqrt{2(1-x)}$，并且可以用一个多项式提供所需的比例因子，使该近似在整个区间内都足够精确。从下面的代码中可以看出，这种方法产生的是直线代码，只需使用三元运算符来处理负半平面中的参数。

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define VECTORIZABLE 1
#define ARR_LEN      (1 << 24)
#define MAX_ULP      1 /* deviation from correctly rounded result */

#if VECTORIZABLE  
/* 
 Compute arccos(a) with a maximum error of 1.496766 ulp 
 This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
 https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

#else // VECTORIZABLE

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/*
 * Arccosine built on asinf_core(); stated maximum error = 1.45667 ulp.
 *
 * The argument is first mapped to -|a|. Two code paths then produce
 * arccos(|a|), and a final reflection handles arguments in [-1, 0].
 * All comparisons are written so that a NaN input falls through without
 * having its "sign" modified.
 */
float my_acosf (float a)
{
    /* pi is represented as the product pi_fac0 * pi_fac1, formed inside an FMA */
    const float pi_fac0      = 0x1.ddcb02p+0f;
    const float pi_fac1      = 0x1.aee9d6p+0f;
    const float half_pi_fac0 = 0x1.ddcb02p-1f; /* pi_fac0 / 2 */
    float neg_abs = a;
    float res;

    if (a > 0.0f) {
        neg_abs = -a;
    }

    if (neg_abs > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        res = fmaf (half_pi_fac0, pi_fac1, asinf_core (neg_abs));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        const float half_one_minus_x = fmaf (0.5f, neg_abs, 0.5f);
        res = 2.0f * asinf_core (sqrtf (half_one_minus_x));
    }

    /* arccos (-x) = pi - arccos(x), applied only for a in [-1, 0] */
    if (!(a > 0.0f) && (a >= -1.0f)) {
        res = fmaf (pi_fac0, pi_fac1, -res);
    }
    return res;
}
#endif // VECTORIZABLE

/*
 * Test driver for my_acosf().
 *
 * Walks the 32-bit counter argi upward from 0, reinterpreting each value as a
 * float argument, ARR_LEN arguments per pass, until the counter wraps back to
 * zero. Each result is compared, as an integer bit pattern, against
 * (float)acos((double)x); the test fails as soon as the patterns differ by
 * more than MAX_ULP.
 *
 * Returns EXIT_SUCCESS if every argument passes, EXIT_FAILURE on the first
 * mismatch or if the work buffers cannot be allocated.
 */
int main (void)
{
    int status = EXIT_SUCCESS;
    double darg, dref;
    float ref, *a, *b;
    uint32_t argi, resi, refi;

    printf ("%svectorizable implementation of acos\n", 
            VECTORIZABLE ? "" : "non-");

    a = malloc (sizeof a[0] * ARR_LEN);
    b = malloc (sizeof b[0] * ARR_LEN);
    if (a == NULL || b == NULL) {
        /* the buffers are large; do not dereference a failed allocation */
        fprintf (stderr, "failed to allocate test buffers\n");
        status = EXIT_FAILURE;
        goto cleanup;
    }

    argi = 0x00000000;
    do {

        /* fill the argument buffer with the next ARR_LEN bit patterns */
        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }

        /* kept as a separate simple loop so the compiler may vectorize it */
        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }

        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > 1 ulp a[i]=% 14.6a  b[i]=% 14.6a  ref=% 14.6a  dref=% 21.13a\n", 
                        a[i], b[i], ref, dref);
                printf ("test FAILED\n");

                status = EXIT_FAILURE;
                goto cleanup;
            }
        }

        /* %x expects unsigned int; cast so the call is correct for any uint32_t */
        printf ("^^^^ argi = %08x\n", (unsigned int)argi);
    } while (argi);

    printf ("test PASSED\n");

cleanup:
    /* free(NULL) is a no-op, so this is safe on every path */
    free (a);
    free (b);

    return status;
}

虽然此代码的结构似乎有助于自动矢量化，但在使用Compiler Explorer提供的编译器定位AVX2时，我没有太多运气。在我上面的测试应用程序的内部循环的上下文中，似乎能够对此代码进行矢量化的唯一编译器是Clang。但是，只有当我指定-ffast-math时，Clang似乎才能做到这一点，然而，这会产生不希望的副作用，即将sqrtf()调用转换为通过rsqrt计算的近似平方根。我尝试了一些不太干扰的开关，例如-fno-honor-nans，-fno-math-errno，-fno-trapping-math，但my_acosf()即使我组合使用它也没有矢量化。

目前我已将上述代码手动翻译为AVX2 + FMA内在函数，如下所示：

#include "immintrin.h"

/*
 * AVX2 + FMA arccosine on a vector of packed floats; stated maximum
 * error = 1.496766 ulp.
 *
 * Per lane: with t = 2 - 2*|x|, the value for |x| is sqrt(t) * (1 + t * P(t));
 * lanes where x < 0 are then replaced by pi - value. The lane selection is
 * done with blends, so there is no branching.
 */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 zero    = _mm256_set1_ps ( 0.0f);
    const __m256 pos_two = _mm256_set1_ps ( 2.0f);
    const __m256 neg_two = _mm256_set1_ps (-2.0f);
    /* polynomial coefficients, highest order first */
    const __m256 k8 = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 k7 = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 k6 = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 k5 = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 k4 = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 k3 = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 k2 = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 k1 = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 k0 = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    /* pi is represented as the product of these two factors */
    const __m256 pi_fac0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);  //  1.86637890e+0
    const __m256 pi_fac1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);  //  1.68325555e+0

    /* lanes with x < 0 (ordered compare) */
    const __m256 is_neg = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
    /* +2 for negative lanes, -2 otherwise */
    const __m256 scale  = _mm256_blendv_ps (neg_two, pos_two, is_neg);
    const __m256 t      = _mm256_fmadd_ps (x, scale, pos_two);
    const __m256 root   = _mm256_sqrt_ps (t);

    __m256 poly = k8;
    poly = _mm256_fmadd_ps (poly, t, k7);
    poly = _mm256_fmadd_ps (poly, t, k6);
    poly = _mm256_fmadd_ps (poly, t, k5);
    poly = _mm256_fmadd_ps (poly, t, k4);
    poly = _mm256_fmadd_ps (poly, t, k3);
    poly = _mm256_fmadd_ps (poly, t, k2);
    poly = _mm256_fmadd_ps (poly, t, k1);
    poly = _mm256_fmadd_ps (poly, t, k0);
    poly = _mm256_mul_ps (poly, t);

    const __m256 pos_half = _mm256_fmadd_ps (poly, root, root);
    /* pi - result, formed as pi_fac0 * pi_fac1 + (0 - result) */
    const __m256 neg_half = _mm256_fmadd_ps (pi_fac0, pi_fac1,
                                             _mm256_sub_ps (zero, pos_half));

    return _mm256_blendv_ps (pos_half, neg_half, is_neg);
}

Answer 2

问题中代码的无分支版本是可能的（几乎没有任何冗余工作，只需一些比较/混合来为 FMA 创建常量），但我不知道编译器是否会自动矢量化它。

如果所有元素都满足 -|a| > -0.5625f，那么主要的额外工作就是一次无用的 sqrt / fma，不幸的是它位于关键路径上。

asinf_core的参数为(r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f))。

与此同时，您（或编译器）可以在输出上混合FMA的系数。

如果您愿意牺牲 pi/2 常数的精度，把它放进单个 float，而不是通过 fmaf 的两个常量被乘数来生成它，则可以写成

fmaf( condition?-1:2, asinf_core_result, condition ? pi/2 : 0)

因此，您可以在两个常量之间进行选择，或者用 andps 将一个常量与 SIMD 比较结果相与，使其有条件地为零（例如在 x86 SSE 上）。

最终修正基于原始输入的范围检查，因此FP混合与asinf_core的FMA工作之间再次存在指令级并行性。

事实上，我们可以把这一步优化进前面作用于 asinf_core 输出的那个 FMA：根据第二个条件，对该 FMA 的常量输入再做一次混合。我们希望 asinf_core 的结果作为它的被乘数之一，这样就可以通过对常数取反来实现取反。（SIMD 实现可以先计算 a_cmp = andnot( a>0.0f, a>=-1.0f)，然后计算 multiplier ^ (-0.0f & a_cmp)，其中 multiplier 已在之前按条件选好。）

输出上该 FMA 的加法常数为 0、pi/2、pi 或 pi + pi/2。给定两个比较结果（对于非 NaN 情况，分别针对 a 和 r=-|a|），我们可以将它们组合成一个 2 位整数，并将其用作洗牌（shuffle）控制，从包含全部 4 个常数的向量中选出一个 FP 常量，例如使用 AVX vpermilps（带可变控制的快速通道内洗牌）。也就是说，不必做 4 种不同的混合，而是把洗牌当作 2 位 LUT 来用！

如果我们这样做，也应该对乘法常数这样做，因为创建常数是主要成本。在 x86 上，可变混合比洗牌更昂贵（通常为 2 uop 对 1 uop）。在 Skylake 上，可变混合（如 vblendvps）可以使用任何端口（而洗牌只能在端口 5 上运行）。这里有足够的 ILP，因此瓶颈很可能在整体 uop 吞吐量或整体 ALU 端口上，而不是端口 5。（Haswell 上的可变混合需要端口 5 上的 2 个 uop，因此它严格地比 vpermilps ymm,ymm,ymm 更差。）

我们将从-1,1，-2和2中选择。

使用三元运算符的标量版本，在 gcc7.3 -O3 -march=skylake -ffast-math 下可以自动向量化（使用 8 个 vblendvps）。自动向量化需要 fast-math :/ 不幸的是，即使使用 -mrecip=none（我原以为它应该禁用这一行为），gcc 仍然使用 rsqrtps + 牛顿迭代（没有 FMA？！？）。

使用 clang5.0（相同选项）时，仅需更少的 vblendvps 即可自动向量化。两者都可以在 Godbolt compiler explorer 上查看。下面的代码可以编译，指令数量看起来大致正确，但除此之外未经测试。

// I think this is far more than enough digits for float precision, but wouldn't hurt to use a standard constant instead of what I typed from memory.
static const float pi_2 = 3.1415926535897932384626433 / 2;
static const float pi = 3.1415926535897932384626433;
//static const float pi_plus_pi_2 = 3.1415926535897932384626433 * 3.0 / 2;

/* maximum error UNKNOWN, completely UNTESTED */
float my_acosf_branchless (float a)
{
    float r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    bool a_in_range = !(a > 0.0f) && (a >= -1.0f);

    bool rsmall = (r > -0.5625f);
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));

    float asinf_res = asinf_core(asinf_arg);

#if 0
    r = fmaf( rsmall?-1.0f:2.0f, asinf_res, rsmall ? pi_2 : 0);
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
#else
    float fma_mul = rsmall? -1.0f:2.0f;
    fma_mul = a_in_range ? -fma_mul : fma_mul;
    float fma_add = rsmall ? pi_2 : 0;
    fma_add = a_in_range ? fma_add + pi : fma_add;
    // to vectorize, turn the 2 conditions into a 2-bit integer.
    // Use vpermilps as a 2-bit LUT of float constants

    // clang doesn't see the LUT trick, but otherwise appears non-terrible at this blending.

    r = fmaf(asinf_res, fma_mul, fma_add);
#endif
    return r;
}

自动矢量化是用一个循环测试的，该循环在包含 1024 个对齐的 float 元素的数组上运行；请参见 Godbolt 链接。

TODO：内在版本。

Answer 3

这算不上一种替代的算法方法，但尽管如此，你可能仍会对这篇扩展的评论感兴趣。

似乎对 gcc 来说，函数 copysignf() 比三元运算符更容易矢量化。在下面的代码中，我用 copysignf() 代替三元运算符重写了你的标量解决方案。

即使使用相当旧的 gcc 4.9 编译器，配合选项 gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno，代码也能矢量化。sqrtf() 函数被矢量化为 vsqrtps 指令。Godbolt link is here.

#include <stdio.h>
#include <immintrin.h>
#include <math.h>

float acosf_cpsgn (float a)
{
    float r, s, t, pi2;
    /* s = (a < 0.0f) ? 2.0f : (-2.0f); */
    s = copysignf(2.0f, -a);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    /* t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r */
    /* r = (a < 0.0f) ? t : r; */
    r = copysignf(r, a);
    pi2 = 0x1.ddcb02p+0f * 0.5f; /* no rounding here */
    pi2 = pi2 - copysignf(pi2, a); /* no rounding here */
    t = fmaf (pi2, 0x1.aee9d6p+0f, r); // PI-r
    return t;
}

float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

/* The code from the next 2 functions is copied from the godbold link in Peter cordes' */
/* answer https://stackoverflow.com/a/49091530/2439725 and modified */
int autovec_test_a (float *__restrict dst, float *__restrict src)
{
    dst = __builtin_assume_aligned(dst,32);
    src = __builtin_assume_aligned(src,32);
    for (int i=0 ; i<1024 ; i++ ) {
        dst[i] = my_acosf(src[i]);
    }
    return 0;
}

int autovec_test_b (float *__restrict dst, float *__restrict src)
{
    dst = __builtin_assume_aligned(dst,32);
    src = __builtin_assume_aligned(src,32);
    for (int i=0 ; i<1024 ; i++ ) {
        dst[i] = acosf_cpsgn(src[i]);
    }
    return 0;
}

{{1}}

acosf（）的准确矢量化实现

3 个答案: