快速获得接近的2的幂(浮点数)的方法

时间:2019-01-21 20:45:28

标签: c++ x86 floating-point ieee-754

在数值计算中,通常需要将数字缩放到安全范围内。

例如,计算欧几里德距离:sqrt(a^2+b^2)。在这里,如果 a 或 b 的量级太小/太大,则可能发生下溢/上溢。

解决此问题的常用方法是将各个数除以其中量级最大的那个数。但是,此解决方案:

  • 慢(除法慢)
  • 造成一些额外的误差

因此,我认为与其除以量级最大的数,不如乘以一个与之接近的 2 的幂的倒数。这似乎是一个更好的解决方案,因为:

  • 乘法比除法要快
  • 更高的精度,因为乘以2的幂是准确的

因此,我想创建一个小的效用函数,它具有这样的逻辑(^,我的意思是求幂):

// Pseudo-code sketch of the desired helper; it is not compilable as written.
// `^` denotes exponentiation here (not XOR) and `<exponent of value>` is a
// placeholder for "the binary exponent of value".
// Intent as written: clamp the exponent e to [-1022, 1022], then output a
// power of two (scaler) together with a second power of two (scalerReciprocal).
// NOTE(review): the `} else` lines carry an extra closing brace, so the braces
// do not balance — confirm the intended structure.
// NOTE(review): for an unbiased e the reciprocal of 2^e would be 2^-e; the
// 2^(2046-e) below looks like a biased-exponent formula — confirm.
void getScaler(double value, double &scaler, double &scalerReciprocal) {
    int e = <exponent of value>;
    if (e<-1022) { scaler=2^-1022; scalerReciprocal = 2^1022; }
    } else if (e>1022) { scaler=2^1022; scalerReciprocal = 2^-1022; }
    } else { scaler=2^e; scalerReciprocal = 2^(2046-e); }
}

此函数应返回归一化的 scaler 和 scalerReciprocal,它们都是 2 的幂,其中 scaler 接近 value,而 scalerReciprocal 是 scaler 的倒数。

scaler / scalerReciprocal 允许的指数范围为 -1022..1022(我不想使用次正规的 scaler,因为次正规数可能很慢)。

什么是快速的实现方法?可以只用纯浮点运算来完成吗?还是应该从 value 中提取指数,并使用简单的 if 来实现这一逻辑?是否有某种技巧可以快速与 (-)1022 进行比较(因为范围是对称的)?

注意:scaler 不必是最接近的 2 的幂。如果某些逻辑需要,scaler 可以是与最接近值相差不多的某个 2 的幂。

3 个答案:

答案 0 :(得分:7)

函数 s = get_scale(z) 计算“接近的 2 的幂”。由于 s 的小数位(fraction bits)全为零,s 的倒数只需一次(廉价的)整数减法即可得到:请参见函数 inv_of_scale。

在 x86 上,get_scale 和 inv_of_scale 都可以被 clang 编译成相当高效的汇编。clang 会把三元运算符转换为 minsd 和 maxsd,另请参阅 Peter Cordes 的评论(comment)。使用 gcc 时,把这些函数改写成 x86 内在函数(intrinsics)代码(get_scale_x86 和 inv_of_scale_x86)会稍微更高效,参见 Godbolt。

请注意,C 明确允许通过联合体(union)进行类型双关(type punning),而 C++(C++11)没有这样的许可。尽管 gcc 8.2 和 clang 7.0 并没有对联合体提出警告,但您可以使用 memcpy 技巧代替联合体技巧来提高 C++ 的可移植性。对代码做这种修改应该是微不足道的。该代码应能正确处理次正规数(subnormal)。

完整代码如下(其输出看起来不错,输出列在代码之后):

#include<stdio.h>
#include<stdint.h>
#include<immintrin.h>
/* gcc -Wall -m64 -O3 -march=sandybridge dbl_scale.c */

/* Overlay of a double and a 64-bit unsigned integer sharing the same storage. */
/* Used by get_scale and inv_of_scale to read and write the raw bits of a      */
/* double (write one member, read the other).                                   */
union dbl_int64{
    double d;      /* floating-point view of the storage */
    uint64_t i;    /* integer (bit pattern) view of the same storage */
};

double get_scale(double t){
    union dbl_int64 x;
    union dbl_int64 x_min;
    union dbl_int64 x_max;
    uint64_t mask_i;
           /* 0xFEDCBA9876543210 */
    x_min.i = 0x0010000000000000ull;
    x_max.i = 0x7FD0000000000000ull;
    mask_i =  0x7FF0000000000000ull;
    x.d = t;
    x.i = x.i & mask_i;                    /* Set fraction bits to zero, take absolute value */
    x.d = (x.d < x_min.d) ? x_min.d : x.d; /* If subnormal: set exponent to 1                */
    x.d = (x.d > x_max.d) ? x_max.d : x.d; /* If exponent is very large: set exponent to 7FD, otherwise the inverse is a subnormal */
    return x.d;
}

/* SSE2 intrinsics variant of get_scale.                                        */
/*                                                                              */
/* Masks t down to its exponent field (sign and fraction cleared), then clamps  */
/* the low lane between the bit patterns 0x0010... and 0x7FD... using the       */
/* scalar max/min intrinsics. Only the low lane is returned.                    */
double get_scale_x86(double t){
    const __m128d exponent_field = _mm_castsi128_pd(_mm_set1_epi64x(0x7FF0000000000000ull));
    const __m128d lowest         = _mm_castsi128_pd(_mm_set1_epi64x(0x0010000000000000ull));
    const __m128d highest        = _mm_castsi128_pd(_mm_set1_epi64x(0x7FD0000000000000ull));

    /* Keep only the exponent bits of the input. */
    __m128d v = _mm_and_pd(_mm_set_sd(t), exponent_field);

    /* Clamp: first from below, then from above (same order as the portable code). */
    v = _mm_min_sd(_mm_max_sd(v, lowest), highest);

    return _mm_cvtsd_f64(v);
}

/* Compute the inverse 1/t of a double t with all zero fraction bits     */
/* and exponent between the limits of function get_scale                 */
/* A single integer subtraction is much less expensive than a            */
/* floating point division.                                               */
double inv_of_scale(double t){
    union dbl_int64 x;
                     /* 0xFEDCBA9876543210 */
    uint64_t inv_mask = 0x7FE0000000000000ull;
    x.d = t;
    x.i = inv_mask - x.i;
    return x.d;
}

/* SSE2 intrinsics variant of inv_of_scale.                                     */
/*                                                                              */
/* Reinterprets t as a 64-bit integer, subtracts it from 0x7FE0... and          */
/* reinterprets the low lane of the difference as a double again.               */
double inv_of_scale_x86(double t){
    const __m128i minuend  = _mm_set1_epi64x(0x7FE0000000000000ull);
    const __m128i raw_bits = _mm_castpd_si128(_mm_set_sd(t));
    const __m128d result   = _mm_castsi128_pd(_mm_sub_epi64(minuend, raw_bits));

    return _mm_cvtsd_f64(result);
}


/* Demo driver: for each sample value print the value, its scale, the inverse   */
/* of the scale, scale*inverse and value*inverse, first with the portable       */
/* functions and then with the x86 intrinsics functions.                        */
int main(){
    /* Several example values, 4.94e-324 is the smallest subnormal */
    double samples[14] = { 4.94e-324, 1.1e-320,  1.1e-300,  1.1e-5,  0.7,  1.7,  123.1, 1.1e300,
                           1.79e308, -1.1e-320,    -0.7, -1.7, -123.1,  -1.1e307};
    const int count = 14;
    int k;

    printf("Portable code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    k = 0;
    while (k < count) {
        double value = samples[k];
        double scale = get_scale(value);
        double recip = inv_of_scale(scale);
        printf("%14e %14e %14e %14e %14e\n", value, scale, recip, scale*recip, value*recip);
        k++;
    }

    printf("\nx86 specific SSE code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    k = 0;
    while (k < count) {
        double value = samples[k];
        double scale = get_scale_x86(value);
        double recip = inv_of_scale_x86(scale);
        printf("%14e %14e %14e %14e %14e\n", value, scale, recip, scale*recip, value*recip);
        k++;
    }

    return 0;
}

向量化

程序输出:

Portable code:
             x       pow_of_2        inverse       pow2*inv      x*inverse
 4.940656e-324  2.225074e-308  4.494233e+307   1.000000e+00   2.220446e-16
 1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00   4.942713e-13
 1.100000e-300  7.466109e-301  1.339386e+300   1.000000e+00   1.473324e+00
  1.100000e-05   7.629395e-06   1.310720e+05   1.000000e+00   1.441792e+00
  7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00   1.400000e+00
  1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00   1.700000e+00
  1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00   1.923437e+00
 1.100000e+300  6.696929e+299  1.493222e-300   1.000000e+00   1.642544e+00
 1.790000e+308  4.494233e+307  2.225074e-308   1.000000e+00   3.982882e+00
-1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00  -4.942713e-13
 -7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00  -1.400000e+00
 -1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00  -1.700000e+00
 -1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00  -1.923437e+00
-1.100000e+307  5.617791e+306  1.780059e-307   1.000000e+00  -1.958065e+00

x86 specific SSE code:
             x       pow_of_2        inverse       pow2*inv      x*inverse
 4.940656e-324  2.225074e-308  4.494233e+307   1.000000e+00   2.220446e-16
 1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00   4.942713e-13
 1.100000e-300  7.466109e-301  1.339386e+300   1.000000e+00   1.473324e+00
  1.100000e-05   7.629395e-06   1.310720e+05   1.000000e+00   1.441792e+00
  7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00   1.400000e+00
  1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00   1.700000e+00
  1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00   1.923437e+00
 1.100000e+300  6.696929e+299  1.493222e-300   1.000000e+00   1.642544e+00
 1.790000e+308  4.494233e+307  2.225074e-308   1.000000e+00   3.982882e+00
-1.099790e-320  2.225074e-308  4.494233e+307   1.000000e+00  -4.942713e-13
 -7.000000e-01   5.000000e-01   2.000000e+00   1.000000e+00  -1.400000e+00
 -1.700000e+00   1.000000e+00   1.000000e+00   1.000000e+00  -1.700000e+00
 -1.231000e+02   6.400000e+01   1.562500e-02   1.000000e+00  -1.923437e+00
-1.100000e+307  5.617791e+306  1.780059e-307   1.000000e+00  -1.958065e+00

函数 get_scale 应该能被支持自动矢量化的编译器自动矢量化。以下代码在 clang 下可以很好地矢量化(无需编写 SSE / AVX 内在函数代码)。

/* Test how well get_scale vectorizes: */
void get_scale_vec(double * __restrict__ t, double * __restrict__ x){
    int n = 1024;
    int i;
    for (i = 0; i < n; i++){
        x[i] = get_scale(t[i]);
    }
}

不幸的是,gcc 找不到 vmaxpd 和 vminpd 指令。

答案 1 :(得分:3)

根据wim的回答,这是另一种解决方案,它可以减少指令数量,从而更快。输出有些不同,但仍满足要求。

这个想法是使用位运算来解决边界情况:无论指数的值是多少,都把指数的最低两位强制设为 01。因此,指数:

  • 0变为1(-1023变为-1022)
  • 2046变为2045(1023变为1022)
  • 其他指数也会被修改,但变化很小:与 wim 的解决方案相比,结果可能变为两倍(指数最低两位 00 -> 01)、减半(10 -> 01)或变为 1/4(11 -> 01)

因此,此修改后的例程有效(而且我认为仅使用2 fast asm instructions即可解决问题是很酷的事情):

#include<stdio.h>
#include<stdint.h>
#include<immintrin.h>
/* gcc -Wall -m64 -O3 -march=sandybridge dbl_scale.c */

/* Overlay of a double and a 64-bit unsigned integer sharing the same storage. */
/* Used by get_scale and inv_of_scale to read and write the raw bits of a      */
/* double (write one member, read the other).                                   */
union dbl_int64{
    double d;      /* floating-point view of the storage */
    uint64_t i;    /* integer (bit pattern) view of the same storage */
};

double get_scale(double t){
    union dbl_int64 x;
    uint64_t and_i;
    uint64_t or_i;
         /* 0xFEDCBA9876543210 */
    and_i = 0x7FD0000000000000ull;
    or_i =  0x0010000000000000ull;
    x.d = t;
    x.i = (x.i & and_i)|or_i;                     /* Set fraction bits to zero, take absolute value */
    return x.d;
}

/* SSE2 intrinsics variant of the AND/OR get_scale.                             */
/*                                                                              */
/* ANDs t with 0x7FD0... (clears sign, fraction and the second-lowest exponent  */
/* bit) and ORs in 0x0010... (sets the lowest exponent bit), then returns the   */
/* low lane as a double.                                                        */
double get_scale_x86(double t){
    const __m128d keep_bits  = _mm_castsi128_pd(_mm_set1_epi64x(0x7FD0000000000000ull));
    const __m128d force_bits = _mm_castsi128_pd(_mm_set1_epi64x(0x0010000000000000ull));
    const __m128d masked     = _mm_and_pd(_mm_set_sd(t), keep_bits);

    return _mm_cvtsd_f64(_mm_or_pd(masked, force_bits));
}

/* Compute the inverse 1/t of a double t with all zero fraction bits     */
/* and exponent between the limits of function get_scale                 */
/* A single integer subtraction is much less expensive than a            */
/* floating point division.                                               */
double inv_of_scale(double t){
    union dbl_int64 x;
                     /* 0xFEDCBA9876543210 */
    uint64_t inv_mask = 0x7FE0000000000000ull;
    x.d = t;
    x.i = inv_mask - x.i;
    return x.d;
}

/* SSE2 intrinsics variant of inv_of_scale.                                     */
/*                                                                              */
/* Reinterprets t as a 64-bit integer, subtracts it from 0x7FE0... and          */
/* reinterprets the low lane of the difference as a double again.               */
double inv_of_scale_x86(double t){
    const __m128i minuend  = _mm_set1_epi64x(0x7FE0000000000000ull);
    const __m128i raw_bits = _mm_castpd_si128(_mm_set_sd(t));
    const __m128d result   = _mm_castsi128_pd(_mm_sub_epi64(minuend, raw_bits));

    return _mm_cvtsd_f64(result);
}


/* Demo driver: for each sample value print the value, its scale, the inverse   */
/* of the scale, scale*inverse and value*inverse, first with the portable       */
/* functions and then with the x86 intrinsics functions.                        */
int main(){
    /* Several example values, 4.94e-324 is the smallest subnormal */
    double samples[14] = { 4.94e-324, 1.1e-320,  1.1e-300,  1.1e-5,  0.7,  1.7,  123.1, 1.1e300,
                           1.79e308, -1.1e-320,    -0.7, -1.7, -123.1,  -1.1e307};
    const int count = 14;
    int k;

    printf("Portable code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    k = 0;
    while (k < count) {
        double value = samples[k];
        double scale = get_scale(value);
        double recip = inv_of_scale(scale);
        printf("%14e %14e %14e %14e %14e\n", value, scale, recip, scale*recip, value*recip);
        k++;
    }

    printf("\nx86 specific SSE code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    k = 0;
    while (k < count) {
        double value = samples[k];
        double scale = get_scale_x86(value);
        double recip = inv_of_scale_x86(scale);
        printf("%14e %14e %14e %14e %14e\n", value, scale, recip, scale*recip, value*recip);
        k++;
    }

    return 0;
}

答案 2 :(得分:2)

您可以使用

double frexp (double x, int* exp); 

返回值是 x 的规格化尾数(其绝对值位于 [0.5, 1) 区间内,x 为 0 时返回 0),*exp 接收相应的 2 的指数(不含偏移量),满足 x = 返回值 × 2^(*exp)。

或者,下面的代码获取双精度的指数部分。

/*
 * Return the exponent field of the double pointed to by d, with the bias
 * 1023 removed.
 *
 * Bits 52..62 of the 64-bit pattern are extracted and 1023 is subtracted.
 * The field is not interpreted further: an all-zero field yields -1023 and
 * an all-ones field yields 1024.
 *
 * Assumes double and long long are both 64 bits wide and that double uses
 * the IEEE-754 binary64 layout.
 *
 * The bits are read through a union instead of dereferencing a
 * (long long *) cast of d: accessing a double object through a long long
 * lvalue violates C's effective-type (strict aliasing) rule and is
 * undefined behavior, whereas reading a different union member is permitted
 * in C.
 */
int get_exp(double *d) {
  union { double value; long long bits; } pun;

  pun.value = *d;
  return (int)((pun.bits & (0x7ffLL << 52)) >> 52) - 1023;
}