Question

在数值计算中，通常需要将数字缩放到安全范围内。

例如，计算欧几里德距离：sqrt(a^2+b^2)。在这里，如果a或b的大小太小/太大，则可能发生下溢/上溢。

解决此问题的常用方法是将数字除以最大数量级数。但是，此解决方案是：

慢（划分慢）
造成一些额外的误差

因此，我认为不要将其除以最大的数量级，而应将其乘以2的幂的倒数。这似乎是一个更好的解决方案，例如：

乘法比除法要快
更高的精度，因为乘以2的幂是准确的

因此，我想创建一个小的效用函数，它具有这样的逻辑（^，我的意思是求幂）：

void getScaler(double value, double &scaler, double &scalerReciprocal) {
    int e = <exponent of value>;
    if (e<-1022) { scaler=2^-1022; scalerReciprocal = 2^1022; }
    } else if (e>1022) { scaler=2^1022; scalerReciprocal = 2^-1022; }
    } else { scaler=2^e; scalerReciprocal = 2^(2046-e); }
}

此函数应返回归一化的scaler和scalerReciprocal，它们都是2的幂，其中scaler接近value，而{{1} }是scalerReciprocal的倒数。

scaler / scaler允许的最大指数为scaleReciprocal（我不想使用次标准-1022..1022，因为次标准数可能很慢）。 / p>

什么是快速方法？可以使用纯浮点运算来完成吗？还是应该从scaler中提取指数，并使用简单的value进行逻辑运算？是否有某种技巧可以快速与（-）1022进行比较（因为范围是对称的）？

注意：if不必是最接近的2的幂。如果某些逻辑需要它，则scaler可以与最接近的值相差很小的2的幂。

Answer 1

函数int match(char* string, char* star) { if (string[0] == '\0' && star[0] == '\0') return 1; else if (star[0] == '*') return match(string, star+1); else if (string[0] == '\0') return 0; else if (string[0] == star[0]){ if (star[-1] == '*') { if (!match(string+1, star+1)) return match(string+1, star); } return match(string+1, star+1); } else if (string[0] != star[0] && star[-1] == '*') return match(string+1, star); else if (string[0] != star[0] && star[-1] != '*') return 0; }计算“ 2的幂”。由于s = get_scale(z)的分数位如果为零，则s的倒数只是一个（廉价的）整数减法：请参见函数s。

在x86上，inv_of_scale和get_scale可以使用clang进行高效编译。编译器clang将三元运算符转换为inv_of_scale和minsd，另请参阅Peter Cordes的comment。使用gcc时，将这些功能转换为x86内在函数代码（maxsd和get_scale_x86），see Godbolt。

请注意，C explicitly permits type-punning through a union, whereas C++ (c++11) has no such permission 尽管gcc 8.2和clang 7.0并没有抱怨联合，但是您可以改进 using the memcpy trick代替C 工会把戏。对代码的这种修改应该是微不足道的。该代码应正确处理次常态。

inv_of_scale_x86

输出看起来不错：

#include<stdio.h>
#include<stdint.h>
#include<immintrin.h>
/* gcc -Wall -m64 -O3 -march=sandybridge dbl_scale.c */

union dbl_int64{
    double d;
    uint64_t i;
};

double get_scale(double t){
    union dbl_int64 x;
    union dbl_int64 x_min;
    union dbl_int64 x_max;
    uint64_t mask_i;
           /* 0xFEDCBA9876543210 */
    x_min.i = 0x0010000000000000ull;
    x_max.i = 0x7FD0000000000000ull;
    mask_i =  0x7FF0000000000000ull;
    x.d = t;
    x.i = x.i & mask_i;                    /* Set fraction bits to zero, take absolute value */
    x.d = (x.d < x_min.d) ? x_min.d : x.d; /* If subnormal: set exponent to 1                */
    x.d = (x.d > x_max.d) ? x_max.d : x.d; /* If exponent is very large: set exponent to 7FD, otherwise the inverse is a subnormal */
    return x.d;
}

double get_scale_x86(double t){
    __m128d x = _mm_set_sd(t);
    __m128d x_min = _mm_castsi128_pd(_mm_set1_epi64x(0x0010000000000000ull));
    __m128d x_max = _mm_castsi128_pd(_mm_set1_epi64x(0x7FD0000000000000ull));
    __m128d mask  = _mm_castsi128_pd(_mm_set1_epi64x(0x7FF0000000000000ull));
            x     = _mm_and_pd(x, mask);
            x     = _mm_max_sd(x, x_min);
            x     = _mm_min_sd(x, x_max);
    return _mm_cvtsd_f64(x);
}

/* Compute the inverse 1/t of a double t with all zero fraction bits     */
/* and exponent between the limits of function get_scale                 */
/* A single integer subtraction is much less expensive than a            */
/* floating point division.                                               */
double inv_of_scale(double t){
    union dbl_int64 x;
                     /* 0xFEDCBA9876543210 */
    uint64_t inv_mask = 0x7FE0000000000000ull;
    x.d = t;
    x.i = inv_mask - x.i;
    return x.d;
}

double inv_of_scale_x86(double t){
    __m128i inv_mask = _mm_set1_epi64x(0x7FE0000000000000ull);
    __m128d x        = _mm_set_sd(t);
    __m128i x_i      = _mm_sub_epi64(inv_mask, _mm_castpd_si128(x));
    return _mm_cvtsd_f64(_mm_castsi128_pd(x_i));
}


int main(){
    int n = 14;
    int i;
    /* Several example values, 4.94e-324 is the smallest subnormal */
    double y[14] = { 4.94e-324, 1.1e-320,  1.1e-300,  1.1e-5,  0.7,  1.7,  123.1, 1.1e300,  
                     1.79e308, -1.1e-320,    -0.7, -1.7, -123.1,  -1.1e307};
    double z, s, u;

    printf("Portable code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    for (i = 0; i < n; i++){  
        z = y[i];
        s = get_scale(z);
        u = inv_of_scale(s);
        printf("%14e %14e %14e %14e %14e\n", z, s, u, s*u, z*u);
    }

    printf("\nx86 specific SSE code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    for (i = 0; i < n; i++){  
        z = y[i];
        s = get_scale_x86(z);
        u = inv_of_scale_x86(s);
        printf("%14e %14e %14e %14e %14e\n", z, s, u, s*u, z*u);
    }

    return 0;
}

向量化

函数Portable code: x pow_of_2 inverse pow2*inv x*inverse 4.940656e-324 2.225074e-308 4.494233e+307 1.000000e+00 2.220446e-16 1.099790e-320 2.225074e-308 4.494233e+307 1.000000e+00 4.942713e-13 1.100000e-300 7.466109e-301 1.339386e+300 1.000000e+00 1.473324e+00 1.100000e-05 7.629395e-06 1.310720e+05 1.000000e+00 1.441792e+00 7.000000e-01 5.000000e-01 2.000000e+00 1.000000e+00 1.400000e+00 1.700000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.700000e+00 1.231000e+02 6.400000e+01 1.562500e-02 1.000000e+00 1.923437e+00 1.100000e+300 6.696929e+299 1.493222e-300 1.000000e+00 1.642544e+00 1.790000e+308 4.494233e+307 2.225074e-308 1.000000e+00 3.982882e+00 -1.099790e-320 2.225074e-308 4.494233e+307 1.000000e+00 -4.942713e-13 -7.000000e-01 5.000000e-01 2.000000e+00 1.000000e+00 -1.400000e+00 -1.700000e+00 1.000000e+00 1.000000e+00 1.000000e+00 -1.700000e+00 -1.231000e+02 6.400000e+01 1.562500e-02 1.000000e+00 -1.923437e+00 -1.100000e+307 5.617791e+306 1.780059e-307 1.000000e+00 -1.958065e+00 x86 specific SSE code: x pow_of_2 inverse pow2*inv x*inverse 4.940656e-324 2.225074e-308 4.494233e+307 1.000000e+00 2.220446e-16 1.099790e-320 2.225074e-308 4.494233e+307 1.000000e+00 4.942713e-13 1.100000e-300 7.466109e-301 1.339386e+300 1.000000e+00 1.473324e+00 1.100000e-05 7.629395e-06 1.310720e+05 1.000000e+00 1.441792e+00 7.000000e-01 5.000000e-01 2.000000e+00 1.000000e+00 1.400000e+00 1.700000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.700000e+00 1.231000e+02 6.400000e+01 1.562500e-02 1.000000e+00 1.923437e+00 1.100000e+300 6.696929e+299 1.493222e-300 1.000000e+00 1.642544e+00 1.790000e+308 4.494233e+307 2.225074e-308 1.000000e+00 3.982882e+00 -1.099790e-320 2.225074e-308 4.494233e+307 1.000000e+00 -4.942713e-13 -7.000000e-01 5.000000e-01 2.000000e+00 1.000000e+00 -1.400000e+00 -1.700000e+00 1.000000e+00 1.000000e+00 1.000000e+00 -1.700000e+00 -1.231000e+02 6.400000e+01 1.562500e-02 1.000000e+00 -1.923437e+00 -1.100000e+307 5.617791e+306 1.780059e-307 1.000000e+00 -1.958065e+00应该使用支持自动矢量化的编译器进行矢量化。以下的代码vectorizes very well with clang（无需编写SSE / AVX内部函数代码）。

get_scale

不幸的是，gcc找不到/* Test how well get_scale vectorizes: */ void get_scale_vec(double * __restrict__ t, double * __restrict__ x){ int n = 1024; int i; for (i = 0; i < n; i++){ x[i] = get_scale(t[i]); } }和vmaxpd指令。

Answer 2

根据wim的回答，这是另一种解决方案，它可以减少指令数量，从而更快。输出有些不同，但仍满足要求。

这个想法是使用位运算来解决边界情况：将01放在指数的lsb上，无论其值如何。因此，指数：

0变为1（-1023变为-1022）
2046变为2045（1023变为1022）
其他指数也进行了修改，但略有改变：与wim的解决方案相比（当指数lsb从00变为01时，该数字可以变成两倍）或减半（当10-> 01）或1/4（11-> 01时）

因此，此修改后的例程有效（而且我认为仅使用2 fast asm instructions即可解决问题是很酷的事情）：

#include<stdio.h>
#include<stdint.h>
#include<immintrin.h>
/* gcc -Wall -m64 -O3 -march=sandybridge dbl_scale.c */

union dbl_int64{
    double d;
    uint64_t i;
};

double get_scale(double t){
    union dbl_int64 x;
    uint64_t and_i;
    uint64_t or_i;
         /* 0xFEDCBA9876543210 */
    and_i = 0x7FD0000000000000ull;
    or_i =  0x0010000000000000ull;
    x.d = t;
    x.i = (x.i & and_i)|or_i;                     /* Set fraction bits to zero, take absolute value */
    return x.d;
}

double get_scale_x86(double t){
    __m128d x = _mm_set_sd(t);
    __m128d x_and = _mm_castsi128_pd(_mm_set1_epi64x(0x7FD0000000000000ull));
    __m128d x_or  = _mm_castsi128_pd(_mm_set1_epi64x(0x0010000000000000ull));
            x     = _mm_and_pd(x, x_and);
            x     = _mm_or_pd(x, x_or);
    return _mm_cvtsd_f64(x);
}

/* Compute the inverse 1/t of a double t with all zero fraction bits     */
/* and exponent between the limits of function get_scale                 */
/* A single integer subtraction is much less expensive than a            */
/* floating point division.                                               */
double inv_of_scale(double t){
    union dbl_int64 x;
                     /* 0xFEDCBA9876543210 */
    uint64_t inv_mask = 0x7FE0000000000000ull;
    x.d = t;
    x.i = inv_mask - x.i;
    return x.d;
}

double inv_of_scale_x86(double t){
    __m128i inv_mask = _mm_set1_epi64x(0x7FE0000000000000ull);
    __m128d x        = _mm_set_sd(t);
    __m128i x_i      = _mm_sub_epi64(inv_mask, _mm_castpd_si128(x));
    return _mm_cvtsd_f64(_mm_castsi128_pd(x_i));
}


int main(){
    int n = 14;
    int i;
    /* Several example values, 4.94e-324 is the smallest subnormal */
    double y[14] = { 4.94e-324, 1.1e-320,  1.1e-300,  1.1e-5,  0.7,  1.7,  123.1, 1.1e300,  
                     1.79e308, -1.1e-320,    -0.7, -1.7, -123.1,  -1.1e307};
    double z, s, u;

    printf("Portable code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    for (i = 0; i < n; i++){  
        z = y[i];
        s = get_scale(z);
        u = inv_of_scale(s);
        printf("%14e %14e %14e %14e %14e\n", z, s, u, s*u, z*u);
    }

    printf("\nx86 specific SSE code:\n");
    printf("             x       pow_of_2        inverse       pow2*inv      x*inverse \n");
    for (i = 0; i < n; i++){  
        z = y[i];
        s = get_scale_x86(z);
        u = inv_of_scale_x86(s);
        printf("%14e %14e %14e %14e %14e\n", z, s, u, s*u, z*u);
    }

    return 0;
}

Answer 3

您可以使用

double frexp (double x, int* exp);

返回值是x的小数部分，exp是指数（减去偏移量）。

或者，下面的代码获取双精度的指数部分。

int get_exp(double *d) {
  long long *l = (long long *) d;
  return ((*l & (0x7ffLL << 52) )>> 52)-1023 ;
}

快速获得2的幂的幂（浮点数）的快速方法

3 个答案: