acosf()的准确矢量化实现

时间:2018-03-03 00:27:01

标签: algorithm math floating-point simd

如果平台支持融合乘法加法(FMA),acosf()的简单实现可以很容易地实现相对于无限精确(数学)结果的1.5 ulp的误差界限。这意味着,与舍入到最近偶数(round-to-nearest-or-even)模式下的正确舍入结果相比,计算结果的偏差永远不会超过1 ulp。

然而,这样的实现通常包括两个主要代码分支,其将主要近似间隔[0,1]大致分成两半,如下面的示例性代码中那样。在针对SIMD架构时,这种分支抑制了编译器的自动矢量化。

是否有另一种算法方法可以更容易地实现自动矢量化,同时保持1.5 ulps的相同误差范围?可以假设对FMA的平台支持。

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* arccos(a) built on asinf_core(); maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    /* -|a|, written as a select so the "sign" of a NaN input is not modified */
    const float neg_abs = (a > 0.0f) ? (-a) : a;
    float res;

    if (neg_abs > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x); the two constants multiply to pi/2 */
        res = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (neg_abs));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        const float half_arg = sqrtf (fmaf (0.5f, neg_abs, 0.5f));
        res = 2.0f * asinf_core (half_arg);
    }

    /* arccos (-x) = pi - arccos(x). Both comparisons are false for NaN,
       and the second is false for a < -1, so those results pass through */
    if ((a >= -1.0f) && !(a > 0.0f)) {
        res = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -res);
    }
    return res;
}

3 个答案:

答案 0 :(得分:4)

我最接近令人满意的解决方案,是基于罗伯特哈利(Robert Harley)的一篇news posting中的想法:他在其中观察到,对于[0,1]中的x,acos(x)≈√(2*(1-x)),并且一个多项式可以提供在整个区间内得到精确近似所需的比例因子。从下面的代码中可以看出,这种方法产生了直线代码,只需使用三元运算符来处理负半平面中的参数。

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define VECTORIZABLE 1
#define ARR_LEN      (1 << 24)
#define MAX_ULP      1 /* deviation from correctly rounded result */

#if VECTORIZABLE  
/* 
 Compute arccos(a) with a maximum error of 1.496766 ulp 
 This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
 https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

#else // VECTORIZABLE

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* Reference arccos(a) using asinf_core(); maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    /* -|a|; a select rather than a negation, to avoid modifying the "sign" of NaNs */
    const float m = (a > 0.0f) ? (-a) : a;
    /* false for NaN, so NaN inputs take the sqrt path and stay NaN */
    const int near_zero = (m > -0.5625f);

    /* near zero: feed -|a| directly; otherwise use sqrt ((1-x) / 2) */
    const float core_arg = near_zero ? m : sqrtf (fmaf (0.5f, m, 0.5f));
    const float core = asinf_core (core_arg);

    /* near zero: arccos(x) = pi/2 - arcsin(x)
       otherwise: arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
    float acc = near_zero ? fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, core)
                          : 2.0f * core;

    /* arccos (-x) = pi - arccos(x), applied for a in [-1, 0] only */
    if (!(a > 0.0f) && (a >= -1.0f)) {
        acc = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -acc);
    }
    return acc;
}
#endif // VECTORIZABLE

/*
 * Exhaustive test driver for my_acosf().
 *
 * Walks the 32-bit argument encoding argi upward from 0 in batches of ARR_LEN
 * and stops once argi has wrapped around to 0 again. Each batch is processed
 * in three separate loops: fill a[] with the bit patterns, evaluate my_acosf()
 * into b[], then compare each result with acos() computed in double precision
 * and rounded to float. Results are compared as integer bit patterns; a
 * difference of more than MAX_ULP fails the test.
 *
 * Returns EXIT_SUCCESS if every argument passes, EXIT_FAILURE on the first
 * mismatch or if the work buffers cannot be allocated.
 */
int main (void)
{
    double darg, dref;
    float ref;
    float *a = NULL, *b = NULL;
    uint32_t argi, resi, refi;
    int status = EXIT_FAILURE;

    printf ("%svectorizable implementation of acos\n", 
            VECTORIZABLE ? "" : "non-");

    a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
    b = (float *)malloc (sizeof(b[0]) * ARR_LEN);
    if ((a == NULL) || (b == NULL)) {
        /* the buffers are large; do not dereference a failed allocation */
        fprintf (stderr, "failed to allocate test buffers\n");
        goto cleanup;
    }

    argi = 0x00000000;
    do {

        /* reinterpret consecutive integers as float arguments */
        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }

        /* function under test, kept in a loop of its own */
        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }

        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            /* distance between the two encodings, measured in float bit patterns */
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > 1 ulp a[i]=% 14.6a  b[i]=% 14.6a  ref=% 14.6a  dref=% 21.13a\n", 
                        a[i], b[i], ref, dref);
                printf ("test FAILED\n");

                goto cleanup;
            }
        }

        printf ("^^^^ argi = %08x\n", argi);
    } while (argi); /* argi wraps to 0 after the last batch */

    printf ("test PASSED\n");
    status = EXIT_SUCCESS;

cleanup:
    /* free(NULL) is a no-op, so this is safe on every path */
    free (a);
    free (b);

    return status;
}

虽然此代码的结构似乎有助于自动矢量化,但在使用Compiler Explorer提供的编译器定位AVX2时,我没有太多运气。在我上面的测试应用程序的内部循环的上下文中,似乎能够对此代码进行矢量化的唯一编译器是Clang。但是,只有当我指定-ffast-math时,Clang似乎才能做到这一点,然而,这会产生不希望的副作用,即将sqrtf()调用转换为通过rsqrt计算的近似平方根。我尝试了一些不太干扰的开关,例如-fno-honor-nans、-fno-math-errno、-fno-trapping-math,但即使我组合使用它们,my_acosf()也没有被矢量化。

目前我已将上述代码手动翻译为AVX2 + FMA内在函数,如下所示:

#include "immintrin.h"

/* arccos of each of the eight floats in x, AVX2 + FMA intrinsics;
   maximum error = 1.496766 ulp */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 vzero   = _mm256_set1_ps ( 0.0f);
    const __m256 vtwo    = _mm256_set1_ps ( 2.0f);
    const __m256 vmtwo   = _mm256_set1_ps (-2.0f);
    /* polynomial coefficients, highest order first */
    const __m256 k0      = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 k1      = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 k2      = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 k3      = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 k4      = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 k5      = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 k6      = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 k7      = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 k8      = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    /* two factors that are multiplied together inside the final FMA */
    const __m256 pi_fac0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);  //  1.86637890e+0
    const __m256 pi_fac1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);  //  1.68325555e+0

    /* lanes with x < 0; the ordered compare is false for NaN lanes */
    const __m256 is_neg = _mm256_cmp_ps (x, vzero, _CMP_LT_OQ);
    /* +2 in negative lanes, -2 elsewhere */
    const __m256 scale  = _mm256_blendv_ps (vmtwo, vtwo, is_neg);
    /* u = x * scale + 2 */
    const __m256 u      = _mm256_fmadd_ps (x, scale, vtwo);
    const __m256 root   = _mm256_sqrt_ps (u);

    /* Horner evaluation of the polynomial in u */
    __m256 poly = k0;
    poly = _mm256_fmadd_ps (poly, u, k1);
    poly = _mm256_fmadd_ps (poly, u, k2);
    poly = _mm256_fmadd_ps (poly, u, k3);
    poly = _mm256_fmadd_ps (poly, u, k4);
    poly = _mm256_fmadd_ps (poly, u, k5);
    poly = _mm256_fmadd_ps (poly, u, k6);
    poly = _mm256_fmadd_ps (poly, u, k7);
    poly = _mm256_fmadd_ps (poly, u, k8);
    poly = _mm256_mul_ps (poly, u);

    /* root + root * poly */
    const __m256 direct    = _mm256_fmadd_ps (poly, root, root);
    /* pi_fac0 * pi_fac1 - direct, used in the negative lanes */
    const __m256 negated   = _mm256_sub_ps (vzero, direct);
    const __m256 reflected = _mm256_fmadd_ps (pi_fac0, pi_fac1, negated);

    return _mm256_blendv_ps (direct, reflected, is_neg);
}

答案 1 :(得分:2)

问题中代码的无分支版本是可能的(几乎没有任何冗余工作,只有一些比较/混合来为FMA创建常量),但我不知道编译器是否会自动矢量化它。

主要的额外工作是:当所有元素都满足-|a| > -0.5625f时,sqrt / fma是无用的,而且不幸的是它位于关键路径上。

asinf_core的参数为(r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f))

与此同时,您(或编译器)可以在输出上混合FMA的系数。

如果您通过将pi/2常数放入一个float(而不是用传给fmaf的2个常量被乘数来创建它)来牺牲pi/2常量的准确性,则可以

fmaf( condition?-1:2, asinf_core_result, condition ? pi/2 : 0)

因此,您可以在两个常量之间进行选择,或者将一个常量与SIMD比较结果进行andps,以使其有条件地为零(例如x86 SSE)。

最终修正基于原始输入的范围检查,因此FP混合与asinf_core的FMA工作之间再次存在指令级并行性。

事实上,我们可以通过在第二个条件上混合该FMA的常量输入,将其优化进asinf_core输出上的前一个FMA。我们希望asinf_core作为它的被乘数之一,所以我们可以通过对常数取负来决定是否取负。(SIMD实现可能会先计算a_cmp = andnot( a>0.0f, a>=-1.0f),然后计算multiplier ^ (-0.0f & a_cmp),其中multiplier已在之前有条件地选定。)

输出上该FMA的加法常数为0、pi/2、pi或pi + pi/2。给定两个比较结果(对于非NaN情况,分别在a和r=-|a|上),我们或许可以将其组合成2位整数,并将其用作shuffle控制,从包含全部4个常数的向量中选择一个FP常量,例如使用AVX vpermilps(带有可变控制的快速通道内shuffle)。即不是以4种不同方式进行混合,而是使用shuffle作为2位LUT!

如果我们这样做,我们也应该对乘法常数这样做,因为创建常数是主要成本。在x86上,可变混合比shuffle更昂贵(通常为2 uops对1)。在Skylake上,可变混合(如vblendvps)可以使用任何端口(而shuffle只能在端口5上运行)。有足够的ILP,因此瓶颈很可能在于整体uop吞吐量或整体ALU端口,而不是端口5。(Haswell上的可变混合是端口5上的2个uop,因此它严格地比vpermilps ymm,ymm,ymm更差。)

我们将从-1,1,-2和2中选择。

使用三元运算符的标量版本,在gcc7.3 -O3 -march=skylake -ffast-math下可以自动向量化(使用8个vblendvps)。自动向量化需要快速数学选项 :/ 而且不幸的是,gcc仍然使用rsqrtps +牛顿迭代(没有FMA?!?),即使使用了-mrecip=none(我原以为它应该会禁用这种做法)。

使用clang5.0(具有相同选项)仅使用5个vblendvps进行自动向量化。同时查看on the Godbolt compiler explorer。这可以编译,看起来可能是正确数量的指令,但是未经测试。

// I think this is far more than enough digits for float precision, but wouldn't hurt to use a standard constant instead of what I typed from memory.
static const float pi_2 = 3.1415926535897932384626433 / 2;
static const float pi = 3.1415926535897932384626433;
//static const float pi_plus_pi_2 = 3.1415926535897932384626433 * 3.0 / 2;

/* maximum error UNKNOWN, completely UNTESTED */
float my_acosf_branchless (float a)
{
    float r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    bool a_in_range = !(a > 0.0f) && (a >= -1.0f);

    bool rsmall = (r > -0.5625f);
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));

    float asinf_res = asinf_core(asinf_arg);

#if 0
    r = fmaf( rsmall?-1.0f:2.0f, asinf_res, rsmall ? pi_2 : 0);
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
#else
    float fma_mul = rsmall? -1.0f:2.0f;
    fma_mul = a_in_range ? -fma_mul : fma_mul;
    float fma_add = rsmall ? pi_2 : 0;
    fma_add = a_in_range ? fma_add + pi : fma_add;
    // to vectorize, turn the 2 conditions into a 2-bit integer.
    // Use vpermilps as a 2-bit LUT of float constants

    // clang doesn't see the LUT trick, but otherwise appears non-terrible at this blending.

    r = fmaf(asinf_res, fma_mul, fma_add);
#endif
    return r;
}

使用循环测试自动矢量化,该循环在1024个对齐的float元素的数组上运行;看看Godbolt链接。

TODO:内在版本。

答案 2 :(得分:2)

这并不是一种替代的算法方法,不过你可能仍会对这条扩展评论感兴趣。

使用gcc时,函数copysignf()似乎比三元运算符更容易矢量化。在下面的代码中,我用copysignf()代替三元运算符重写了你的标量解决方案。

即使使用相当旧的gcc 4.9编译器,该代码也能矢量化,所用选项为gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno。sqrtf()函数被矢量化为vsqrtps指令。Godbolt link is here.

#include <stdio.h>
#include <immintrin.h>
#include <math.h>

float acosf_cpsgn (float a)
{
    float r, s, t, pi2;
    /* s = (a < 0.0f) ? 2.0f : (-2.0f); */
    s = copysignf(2.0f, -a);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r = 0x1.c86000p-22f; // 4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
    r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
    r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
    r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
    r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    /* t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r */
    /* r = (a < 0.0f) ? t : r; */
    r = copysignf(r, a);
    pi2 = 0x1.ddcb02p+0f * 0.5f; /* no rounding here */
    pi2 = pi2 - copysignf(pi2, a); /* no rounding here */
    t = fmaf (pi2, 0x1.aee9d6p+0f, r); // PI-r
    return t;
}

float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r = 0x1.c86000p-22f; // 4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
    r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
    r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
    r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
    r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

/* The code from the next 2 functions is copied from the godbold link in Peter cordes' */
/* answer https://stackoverflow.com/a/49091530/2439725 and modified */
int autovec_test_a (float *__restrict dst, float *__restrict src)
{
    dst = __builtin_assume_aligned(dst,32);
    src = __builtin_assume_aligned(src,32);
    for (int i=0 ; i<1024 ; i++ ) {
        dst[i] = my_acosf(src[i]);
    }
    return 0;
}

int autovec_test_b (float *__restrict dst, float *__restrict src)
{
    dst = __builtin_assume_aligned(dst,32);
    src = __builtin_assume_aligned(src,32);
    for (int i=0 ; i<1024 ; i++ ) {
        dst[i] = acosf_cpsgn(src[i]);
    }
    return 0;
}