如果平台支持融合乘加(FMA),`acosf()` 的一个简单实现就可以很容易地达到相对于无限精确(数学)结果 1.5 ulp 的误差界限。这意味着,其结果与舍入到最近偶数(round-to-nearest-or-even)模式下的正确舍入结果相比,相差永远不会超过 1 ulp。
是否存在另一种算法方法,能够更容易地实现自动矢量化,同时保持同样的 1.5 ulp 误差界限?可以假设平台支持 FMA。
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
float r, s;
s = a * a;
r = 0x1.a7f260p-5f; // 5.17513156e-2
r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
r = r * s;
r = fmaf (r, a, a);
return r;
}
/*
 * arccos(a) built on top of asinf_core(); stated maximum error = 1.45667 ulp.
 * The argument is first mapped to -|a|, arccos(|a|) is computed by one of two
 * identities, and non-positive inputs in [-1, 0] are reflected with pi - r.
 * pi/2 and pi are formed inside an FMA as the product of two float constants.
 */
float my_acosf (float a)
{
    float neg_abs, res;

    /* select instead of fabsf/negation so the "sign" of a NaN is not touched */
    neg_abs = (a > 0.0f) ? (-a) : a;
    if (!(neg_abs > -0.5625f)) {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)); NaN also takes this path */
        res = 2.0f * asinf_core (sqrtf (fmaf (0.5f, neg_abs, 0.5f)));
    } else {
        /* arccos(x) = pi/2 - arcsin(x), evaluated as pi/2 + arcsin(-|a|) */
        res = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (neg_abs));
    }
    /* positive, NaN and < -1 inputs are returned as computed */
    if ((a > 0.0f) || !(a >= -1.0f)) {
        return res;
    }
    /* arccos (-x) = pi - arccos(x) */
    return fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -res);
}
答案 0 :(得分:4)
我得到的最接近令人满意的解决方案,是基于 Robert Harley 一篇新闻组帖子(news posting)中的想法:他在帖子中观察到,对于 [0,1] 中的 x,有 acos(x) ≈ √(2*(1-x)),并且可以用一个多项式来提供所需的比例因子,使其在整个区间上成为精确的近似。从下面的代码中可以看出,这种方法产生的是直线式(无分支)代码,只需使用三元运算符来处理负半平面中的参数。
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#define VECTORIZABLE 1
#define ARR_LEN (1 << 24)
#define MAX_ULP 1 /* deviation from correctly rounded result */
#if VECTORIZABLE
/*
Compute arccos(a) with a maximum error of 1.496766 ulp
This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
float r, s, t;
s = (a < 0.0f) ? 2.0f : (-2.0f);
t = fmaf (s, a, 2.0f);
s = sqrtf (t);
r = 0x1.c86000p-22f; // 4.25032340e-7
r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
r = r * t;
r = fmaf (r, s, s);
t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
r = (a < 0.0f) ? t : r;
return r;
}
#else // VECTORIZABLE
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
float r, s;
s = a * a;
r = 0x1.a7f260p-5f; // 5.17513156e-2
r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
r = r * s;
r = fmaf (r, a, a);
return r;
}
/*
 * arccos(a) built on top of asinf_core(); stated maximum error = 1.45667 ulp.
 * The argument is first mapped to -|a|, arccos(|a|) is computed by one of two
 * identities, and non-positive inputs in [-1, 0] are reflected with pi - r.
 * pi/2 and pi are formed inside an FMA as the product of two float constants.
 */
float my_acosf (float a)
{
    float neg_abs, res;

    /* select instead of fabsf/negation so the "sign" of a NaN is not touched */
    neg_abs = (a > 0.0f) ? (-a) : a;
    if (!(neg_abs > -0.5625f)) {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)); NaN also takes this path */
        res = 2.0f * asinf_core (sqrtf (fmaf (0.5f, neg_abs, 0.5f)));
    } else {
        /* arccos(x) = pi/2 - arcsin(x), evaluated as pi/2 + arcsin(-|a|) */
        res = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (neg_abs));
    }
    /* positive, NaN and < -1 inputs are returned as computed */
    if ((a > 0.0f) || !(a >= -1.0f)) {
        return res;
    }
    /* arccos (-x) = pi - arccos(x) */
    return fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -res);
}
#endif // VECTORIZABLE
/*
 * Exhaustive accuracy test for my_acosf().
 *
 * Walks through every 32-bit pattern (argi starts at 0 and the outer loop ends
 * when it wraps back to 0), ARR_LEN patterns per pass. Each pattern is
 * reinterpreted as a float, run through my_acosf(), and compared with the
 * double-precision acos() result rounded to float. The comparison is done on
 * the raw encodings; a difference larger than MAX_ULP fails the test.
 *
 * Returns EXIT_SUCCESS if all inputs pass, EXIT_FAILURE on the first mismatch
 * or if the work buffers cannot be allocated.
 */
int main (void)
{
    int status = EXIT_SUCCESS;
    uint32_t argi = 0x00000000;
    float *a, *b;

    printf ("%svectorizable implementation of acos\n",
            VECTORIZABLE ? "" : "non-");
    a = malloc (sizeof(a[0]) * ARR_LEN);
    b = malloc (sizeof(b[0]) * ARR_LEN);
    if (a == NULL || b == NULL) {
        /* free(NULL) is a no-op, so whichever allocation succeeded is released */
        fprintf (stderr, "failed to allocate test buffers\n");
        free (a);
        free (b);
        return EXIT_FAILURE;
    }
    do {
        /* fill the argument array with the next ARR_LEN consecutive bit patterns */
        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }
        /* kept as a separate simple loop so the compiler may vectorize it */
        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }
        for (int i = 0; i < ARR_LEN; i++) {
            uint32_t resi, refi;
            double dref = acos ((double)a[i]);
            float ref = (float)dref;

            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > 1 ulp a[i]=% 14.6a b[i]=% 14.6a ref=% 14.6a dref=% 21.13a\n",
                        a[i], b[i], ref, dref);
                printf ("test FAILED\n");
                status = EXIT_FAILURE;
                goto cleanup;
            }
        }
        printf ("^^^^ argi = %08x\n", (unsigned int)argi);
    } while (argi);
    printf ("test PASSED\n");
cleanup:
    free (a);
    free (b);
    return status;
}
虽然此代码的结构似乎有利于自动矢量化,但在使用 Compiler Explorer 提供的编译器、以 AVX2 为目标时,我的运气并不好。在上面测试程序内部循环的上下文中,似乎唯一能够对此代码进行矢量化的编译器是 Clang。但是,只有在我指定 `-ffast-math` 时 Clang 才能做到这一点,而这会带来不希望的副作用:`sqrtf()` 调用会被转换为通过 `rsqrt` 计算的近似平方根。我尝试了一些干扰性较小的开关,例如 `-fno-honor-nans`、`-fno-math-errno`、`-fno-trapping-math`,但即使把它们组合起来使用,`my_acosf()` 也没有被矢量化。
目前我已将上述代码手动翻译为 `AVX2` + `FMA` 内在函数(intrinsics),如下所示:
#include "immintrin.h"
/*
 * arccos of each float lane of x, written with AVX and FMA intrinsics.
 * Stated maximum error = 1.496766 ulp.
 * Per lane: u = 2 - 2|x|, r = sqrt(u) * (1 + u * P(u)); lanes with x < 0
 * return pi - r, where pi is formed as pi_a * pi_b inside an FMA.
 */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 v_zero    = _mm256_set1_ps ( 0.0f);
    const __m256 v_two     = _mm256_set1_ps ( 2.0f);
    const __m256 v_neg_two = _mm256_set1_ps (-2.0f);
    const __m256 k0 = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 k1 = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 k2 = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 k3 = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 k4 = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 k5 = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 k6 = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 k7 = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 k8 = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    const __m256 pi_a = _mm256_set1_ps ( 0x1.ddcb02p+0f); // 1.86637890e+0
    const __m256 pi_b = _mm256_set1_ps ( 0x1.aee9d6p+0f); // 1.68325555e+0

    /* lanes with x < 0 (ordered compare: false for NaN lanes) */
    const __m256 is_neg = _mm256_cmp_ps (x, v_zero, _CMP_LT_OQ);
    /* +2 where x < 0, -2 elsewhere */
    const __m256 scale = _mm256_blendv_ps (v_neg_two, v_two, is_neg);
    const __m256 u = _mm256_fmadd_ps (x, scale, v_two);
    const __m256 root = _mm256_sqrt_ps (u);

    /* Horner evaluation of P(u) */
    __m256 p = k0;
    p = _mm256_fmadd_ps (p, u, k1);
    p = _mm256_fmadd_ps (p, u, k2);
    p = _mm256_fmadd_ps (p, u, k3);
    p = _mm256_fmadd_ps (p, u, k4);
    p = _mm256_fmadd_ps (p, u, k5);
    p = _mm256_fmadd_ps (p, u, k6);
    p = _mm256_fmadd_ps (p, u, k7);
    p = _mm256_fmadd_ps (p, u, k8);
    p = _mm256_mul_ps (p, u);
    p = _mm256_fmadd_ps (p, root, root);

    /* pi - p, selected only for the negative lanes */
    const __m256 reflected = _mm256_fmadd_ps (pi_a, pi_b, _mm256_sub_ps (v_zero, p));
    return _mm256_blendv_ps (p, reflected, is_neg);
}
答案 1 :(得分:2)
问题中代码的无分支版本是可能的(几乎没有任何冗余工作,只需一些比较/混合来为 FMA 创建常量),但我不确定编译器是否会对它进行自动矢量化。
如果所有元素都满足 `-|a| > -0.5625f`,那么主要的额外工作就是一次无用的 `sqrt` / `fma`,不幸的是它位于关键路径上。
`asinf_core` 的参数为 `(r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f))`。
与此同时,您(或编译器)可以并行地为输出端的 FMA 混合系数。
如果您愿意牺牲 `pi/2` 常数的精度——把它放进单个 `float`,而不是用 `fmaf` 的 2 个常量被乘数来构造它——则可以写成:
`fmaf( condition?-1:2, asinf_core_result, condition ? pi/2 : 0)`
因此,您可以在两个常量之间进行选择,或者用 SIMD 比较结果对一个常量做 `andps`,使其有条件地变为零(例如 x86 SSE)。
最终修正基于对原始输入的范围检查,因此 FP 混合与 `asinf_core` 的 FMA 工作之间同样存在指令级并行性。
事实上,我们可以把第二个条件也混合进常量输入,从而将最终修正合并到作用于 `asinf_core` 输出的前一个 FMA 中。我们希望 `asinf_core` 的结果作为该 FMA 的被乘数之一,这样就可以通过对常数取反来实现取反。(SIMD 实现可以先计算 `a_cmp = andnot( a>0.0f, a>=-1.0f)`,然后计算 `multiplier ^ (-0.0f & a_cmp)`,其中 `multiplier` 已在之前按条件选好。)
输出上该 FMA 的加法常数为 `0`、`pi/2`、`pi` 或 `pi + pi/2`。给定两个比较结果(对于非 NaN 情况,分别作用于 `a` 和 `r=-|a|`),我们可以将它们组合成一个 2 位整数,并把它用作 shuffle 控制,从包含全部 4 个常数的向量中选出一个 FP 常量,例如使用 AVX `vpermilps`(带可变控制的快速通道内 shuffle)。也就是说,不必以 4 种不同方式进行混合,而是把 shuffle 当作一个 2 位 LUT!
如果我们这样做,也应该对乘法常数做同样的处理,因为创建常数是主要成本。在 x86 上,可变混合比 shuffle 更昂贵(通常为 2 uop 对 1 uop)。在 Skylake 上,可变混合(如 `vblendvps`)可以使用任何端口(而 shuffle 只能在端口 5 上运行)。由于有足够的 ILP,瓶颈很可能在于整体 uop 吞吐量或整体 ALU 端口,而不是端口 5。(Haswell 上的可变混合是端口 5 上的 2 个 uop,因此它严格地比 `vpermilps ymm,ymm,ymm` 更差。)
我们将从 -1、1、-2 和 2 中选择。
使用三元运算符的标量代码,在 gcc7.3 `-O3 -march=skylake -ffast-math` 下可以自动向量化(使用 8 条 `vblendvps`)。自动向量化需要 fast-math :/ 不幸的是,gcc 仍然使用 `rsqrtps` + 牛顿迭代(没有 FMA?!?),即使使用了 `-mrecip=none`——我原以为该选项应当禁用这种做法。
使用 clang5.0(具有相同选项)时,只需更少的 `vblendvps` 指令即可
// I think this is far more than enough digits for float precision, but wouldn't hurt to use a standard constant instead of what I typed from memory.
/* pi/2 and pi, each rounded to a single float (less accurate than the
   two-factor FMA products used by the branchy version) */
static const float pi_2 = 3.1415926535897932384626433 / 2;
static const float pi = 3.1415926535897932384626433;

/*
 * Branch-free formulation of the asinf_core()-based arccos.
 * Maximum error UNKNOWN; accuracy has not been measured.
 *
 * With r = -|a| and x = asinf_core(arg), the branchy code computes
 *     rsmall  : r1 = pi/2 + x      (x = arcsin(-|a|) is already negative)
 *     !rsmall : r1 = 2 * x
 * and, for a in [-1, 0], returns pi - r1. Both steps fold into one FMA
 * x * mul + add with
 *     mul in { 1, 2, -1, -2 }   and   add in { pi/2, 0, pi/2, pi }.
 * Inputs outside [-1, 1] and NaN yield NaN through sqrtf/asinf_core.
 */
float my_acosf_branchless (float a)
{
    float r = (a > 0.0f) ? (-a) : a; // r = -|a|; avoid modifying the "sign" of NaNs
    bool a_in_range = !(a > 0.0f) && (a >= -1.0f);
    bool rsmall = (r > -0.5625f);
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));
    float asinf_res = asinf_core(asinf_arg);
#if 0
    /* two-step reference form: arccos(|a|) first, then the reflection */
    r = fmaf( rsmall ? 1.0f : 2.0f, asinf_res, rsmall ? pi_2 : 0);
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
#else
    /* multiplier is +1 (not -1) for rsmall because asinf_core received -|a| */
    float fma_mul = rsmall ? 1.0f : 2.0f;
    /* pi - (mul*x + add) == (-mul)*x + (pi - add) */
    fma_mul = a_in_range ? -fma_mul : fma_mul;
    float fma_add = a_in_range ? (rsmall ? pi_2 : pi)
                               : (rsmall ? pi_2 : 0.0f);
    // to vectorize, turn the 2 conditions into a 2-bit integer.
    // Use vpermilps as a 2-bit LUT of float constants
    // clang doesn't see the LUT trick, but otherwise appears non-terrible at this blending.
    r = fmaf(asinf_res, fma_mul, fma_add);
#endif
    return r;
}
进行自动向量化。两者都可以在 Godbolt compiler explorer 上查看。这段代码可以编译,指令数量看起来也大致正确,但除此之外未经测试。
使用一个循环来测试自动矢量化,该循环在包含 1024 个对齐的 `float` 元素的数组上运行;参见 Godbolt 链接。
TODO:内在函数(intrinsics)版本。
答案 2 :(得分:2)
这并不算是一种替代的算法方法,但这篇扩展的评论可能仍然会让你感兴趣。
似乎在使用 gcc 时,函数 `copysignf()` 比三元运算符更容易被矢量化。在下面的代码中,我用 `copysignf()` 代替三元运算符,重写了你的标量解决方案。
即使使用相当旧的 gcc 4.9 编译器,并配合选项 `gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno`,该代码也能被矢量化。
`sqrtf()` 函数被向量化为 `vsqrtps`
#include <stdio.h>
#include <immintrin.h>
#include <math.h>
float acosf_cpsgn (float a)
{
float r, s, t, pi2;
/* s = (a < 0.0f) ? 2.0f : (-2.0f); */
s = copysignf(2.0f, -a);
t = fmaf (s, a, 2.0f);
s = sqrtf (t);
r = 0x1.c86000p-22f; // 4.25032340e-7
r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
r = r * t;
r = fmaf (r, s, s);
/* t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r */
/* r = (a < 0.0f) ? t : r; */
r = copysignf(r, a);
pi2 = 0x1.ddcb02p+0f * 0.5f; /* no rounding here */
pi2 = pi2 - copysignf(pi2, a); /* no rounding here */
t = fmaf (pi2, 0x1.aee9d6p+0f, r); // PI-r
return t;
}
float my_acosf (float a)
{
float r, s, t;
s = (a < 0.0f) ? 2.0f : (-2.0f);
t = fmaf (s, a, 2.0f);
s = sqrtf (t);
r = 0x1.c86000p-22f; // 4.25032340e-7
r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
r = r * t;
r = fmaf (r, s, s);
t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
r = (a < 0.0f) ? t : r;
return r;
}
/* The next 2 functions are adapted from the Godbolt link in Peter Cordes'   */
/* answer https://stackoverflow.com/a/49091530/2439725                       */

/*
 * Auto-vectorization probe for my_acosf(): applies it to 1024 floats read
 * from src and stores the results to dst. Both pointers are promised to the
 * compiler to be 32-byte aligned. Always returns 0.
 */
int autovec_test_a (float *__restrict dst, float *__restrict src) {
    enum { ELEM_COUNT = 1024 };

    dst = __builtin_assume_aligned(dst, 32);
    src = __builtin_assume_aligned(src, 32);
    for (int idx = 0; idx < ELEM_COUNT; ++idx) {
        dst[idx] = my_acosf(src[idx]);
    }
    return 0;
}
/*
 * Auto-vectorization probe for acosf_cpsgn(): applies it to 1024 floats read
 * from src and stores the results to dst. Both pointers are promised to the
 * compiler to be 32-byte aligned. Always returns 0.
 */
int autovec_test_b (float *__restrict dst, float *__restrict src) {
    enum { ELEM_COUNT = 1024 };

    dst = __builtin_assume_aligned(dst, 32);
    src = __builtin_assume_aligned(src, 32);
    for (int idx = 0; idx < ELEM_COUNT; ++idx) {
        dst[idx] = acosf_cpsgn(src[idx]);
    }
    return 0;
}
指令。Godbolt 链接在此。
{{1}}