Question

我正在使用Anthony Williams的定点库（见Dr. Dobb's上的文章“Optimizing Math-Intensive Applications with Fixed-Point Arithmetic”），并用恒向线（Rhumb Line）方法来计算两个地理点之间的距离。

当点之间的距离很大（大于几公里）时，这种方法效果很好，但在较小的距离上效果非常差。最坏的情况是两个点相同或几乎相同时，结果却是194米的距离，而我需要对大于1米的距离至少达到1米的精度。

通过与双精度浮点实现的比较，我将问题定位到fixed::sqrt()函数，该函数在较小值时表现不佳：

x       std::sqrt(x)    fixed::sqrt(x)  error
----------------------------------------------------
0       0               3.05176e-005    3.05176e-005
1e-005  0.00316228      0.00316334      1.06005e-006
2e-005  0.00447214      0.00447226      1.19752e-007
3e-005  0.00547723      0.0054779       6.72248e-007
4e-005  0.00632456      0.00632477      2.12746e-007
5e-005  0.00707107      0.0070715       4.27244e-007
6e-005  0.00774597      0.0077467       7.2978e-007
7e-005  0.0083666       0.00836658      1.54875e-008
8e-005  0.00894427      0.00894427      1.085e-009

将fixed::sqrt(0)作为特殊情况处理来纠正其结果很容易，但这不能解决小的非零距离的问题：误差从194米开始，随着距离的增加而趋于零。在接近零处，我可能至少需要将精度提高一个数量级。

上面链接的文章的第4页简要解释了fixed::sqrt()算法，但我很难理解它，更不用说判断是否有可能改进它。该函数的代码如下：

fixed fixed::sqrt() const
{
    unsigned const max_shift=62;
    uint64_t a_squared=1LL<<max_shift;
    unsigned b_shift=(max_shift+fixed_resolution_shift)/2;
    uint64_t a=1LL<<b_shift;

    uint64_t x=m_nVal;

    while(b_shift && a_squared>x)
    {
        a>>=1;
        a_squared>>=2;
        --b_shift;
    }

    uint64_t remainder=x-a_squared;
    --b_shift;

    while(remainder && b_shift)
    {
        uint64_t b_squared=1LL<<(2*b_shift-fixed_resolution_shift);
        int const two_a_b_shift=b_shift+1-fixed_resolution_shift;
        uint64_t two_a_b=(two_a_b_shift>0)?(a<<two_a_b_shift):(a>>-two_a_b_shift);

        while(b_shift && remainder<(b_squared+two_a_b))
        {
            b_squared>>=2;
            two_a_b>>=1;
            --b_shift;
        }
        uint64_t const delta=b_squared+two_a_b;
        if((2*remainder)>delta)
        {
            a+=(1LL<<b_shift);
            remainder-=delta;
            if(b_shift)
            {
                --b_shift;
            }
        }
    }
    return fixed(internal(),a);
}

请注意，m_nVal是内部定点表示值，其类型为int64_t，采用Q36.28格式（fixed_resolution_shift = 28）。该表示本身对至少8位小数具有足够的精度；按赤道弧的分数计算，这足以分辨约0.14米的距离，因此限制并不在于定点表示。

使用恒向线（rhumb line）方法是该应用所遵循的标准机构的建议，因此无法更改；况且在应用程序的其他地方或将来的应用程序中，也可能需要更准确的平方根函数。

问题：是否有可能提高fixed::sqrt()算法对小的非零值的准确性，同时仍保持其有界和确定性收敛？

其他信息 用于生成上表的测试代码：

#include <cmath>
#include <iostream>
#include "fixed.hpp"

int main()
{
    double error = 1.0 ;
    for( double x = 0.0; error > 1e-8; x += 1e-5 )
    {
        double fixed_root = sqrt(fixed(x)).as_double() ;
        double std_root = std::sqrt(x) ;
        error = std::fabs(fixed_root - std_root) ;
        std::cout << x << '\t' << std_root << '\t' << fixed_root << '\t' << error << std::endl ;
    }
}

fixed fixed::sqrt() const
{
    // One root bit is produced per pass.  The original countdown
    // (count = 31 + shift/2, tested with count-- != 0) executes
    // 32 + shift/2 passes, which is what this counted loop spells out.
    const int root_bits = 32 + (fixed_resolution_shift >> 1);

    uint64_t root = 0;            // root accumulated so far
    uint64_t rem_high = 0;        // high part of the partial remainder
    uint64_t rem_low = m_nVal;    // low part: argument bits not yet consumed

    for (int pass = 0; pass < root_bits; ++pass)
    {
        // Move the next two argument bits from the low part into the high part.
        rem_high = (rem_high << 2) | (rem_low >> 62);
        rem_low <<= 2;

        // Open a new (zero) bit at the bottom of the root.
        root <<= 1;

        // 2*root + 1 is what must be subtracted for that new bit to become 1.
        const uint64_t trial = (root << 1) + 1;
        if (rem_high >= trial)
        {
            rem_high -= trial;
            ++root;
        }
    }

    return fixed(internal(), root);
}

结论根据Justin Peel的解决方案和分析，并与"The Neglected Art of Fixed Point Arithmetic"中的算法进行比较，我对后者进行了如下调整：

（调整后的实现即上文列出的 fixed fixed::sqrt() const 函数：它使用 remHi/remLo，每次取参数的两位并逐位构造平方根。）

虽然这提供了更高的精度，但仍达不到我所需要的改进。Q36.28格式本身提供了我需要的精度，但执行sqrt()时不可能不损失几位精度。然而，一些横向思维提供了更好的解决方案。我的应用程序是将计算出的距离与某个距离限制进行比较。事后看来，一个相当明显的解决方案是：直接比较距离的平方与限制的平方！

Answer 1

鉴于sqrt(ab) = sqrt(a)sqrt(b)，那么你能否只捕获数值很小的情况，将其左移给定的位数，计算平方根，然后再将结果右移一半的位数？

即

 sqrt(n) = sqrt(n.2^k)/sqrt(2^k)
         = sqrt(n.2^k).2^(-k/2)

例如，对于任何小于2^8的n，选择k = 28。

Answer 2

原来的实施显然存在一些问题。我试图用代码当前完成的方式解决所有这些问题而感到沮丧，并最终采用不同的方法进行修复。我现在可以修复原版，但无论如何我更喜欢我的方式。

我把输入数字视为从Q64开始，这相当于先左移28位，之后再右移14位（开平方会把小数位数减半）。但是，如果只这样做，精度将被限制在1/2^14 = 6.1035e-5，因为最后14位将为0。为了解决这个问题，我将a和remainder相应地移位，然后再运行一次循环以继续填充剩余的位。代码可以写得更高效、更清晰，但我把这留给别人。下面显示的精度已经达到Q36.28所能达到的水平。如果先把输入数字截断为定点数（转换为定点再转回浮点），再将定点sqrt与其浮点sqrt进行比较，则误差约为2e-9（下面的代码中没有这样做，但只需改动一行）。这与Q36.28的最佳精度一致，即1/2^28 = 3.7253e-9。

顺便说一下，原始代码中的一个大错误是从未考虑m = 0的项，因此该位永远无法被置位。无论如何，这是代码。享受！

remainder

程序的输出为

#include <iostream>
#include <cmath>

#include <cstdint>

// Use the standard exact-width 64-bit type.  A hand-written
// `typedef unsigned long` is only 32 bits wide on LLP64/ILP32 targets, which
// would silently truncate the 1ULL<<62 arithmetic in sqrt() below.
using std::uint64_t;

// Digit-by-digit refinement step shared by both passes of sqrt().
//
// On entry `a` is the root found so far and `remainder` is the radicand minus
// a*a.  `b_shift` is the one-based position of the next candidate bit, i.e.
// the candidate is b = 2^(b_shift-1).  A candidate is accepted when
// remainder >= b*b + 2*a*b; it is then added to `a` and that amount is
// removed from `remainder`.  Both are updated in place.
static void refine_sqrt_bits(uint64_t& a, uint64_t& remainder, unsigned b_shift)
{
    while (remainder && b_shift)
    {
        uint64_t b_squared = 1ULL << (2 * (b_shift - 1));
        uint64_t two_a_b = a << b_shift;

        // Skip candidate bits that are too large for the current remainder.
        while (b_shift && remainder < (b_squared + two_a_b))
        {
            b_squared >>= 2;
            two_a_b >>= 1;
            --b_shift;
        }

        const uint64_t delta = b_squared + two_a_b;
        if (remainder >= delta && b_shift)
        {
            a += 1ULL << (b_shift - 1);
            remainder -= delta;
            --b_shift;
        }
    }
}

// Square root of an unsigned fixed-point value with 28 fractional bits; the
// result uses the same format.
//
// Pass 1 finds the integer square root of the raw value.  The root and the
// remainder are then rescaled (root by 14 bits, remainder by 28 bits) and
// pass 2 continues the same search to fill in the low-order bits of the root.
uint64_t sqrt(uint64_t in_val)
{
    const unsigned fixed_resolution_shift = 28;
    const unsigned max_shift = 62;

    // Zero has no leading bit to locate; answering directly also avoids
    // decrementing an unsigned bit position that has already reached zero.
    if (in_val == 0)
    {
        return 0;
    }

    // Find the largest power of four not exceeding the input, and its root.
    uint64_t a_squared = 1ULL << max_shift;
    unsigned b_shift = (max_shift >> 1) + 1;
    uint64_t a = 1ULL << (b_shift - 1);
    while (b_shift && a_squared > in_val)
    {
        a >>= 1;
        a_squared >>= 2;
        --b_shift;
    }

    // Pass 1: remaining bits of the integer root, below the leading bit.
    uint64_t remainder = in_val - a_squared;
    refine_sqrt_bits(a, remainder, b_shift - 1);

    // Pass 2: rescale and keep going, using the same starting position as the
    // original code (fixed_resolution_shift/2 + 1).
    a <<= (fixed_resolution_shift / 2);
    remainder <<= fixed_resolution_shift;
    refine_sqrt_bits(a, remainder, (fixed_resolution_shift / 2) + 1);

    return a;
}

// Converts a raw fixed-point value with 28 fractional bits to a double.
double fixed2float(uint64_t x)
{
    // 2^28 is exactly representable, so dividing by it yields the same value
    // as multiplying by 2^-28.
    const double one = static_cast<double>(1ULL << 28);
    return static_cast<double>(x) / one;
}

// Converts a double to a raw fixed-point value with 28 fractional bits; the
// cast discards whatever fraction is left after scaling.
uint64_t float2fixed(double f)
{
    const double scale = static_cast<double>(1ULL << 28);
    const double scaled = f * scale;
    return static_cast<uint64_t>(scaled);
}

// Prints the fixed-point and floating-point square roots of `num` together
// with their difference (floating-point minus fixed-point).
void finderror(double num)
{
    const uint64_t fixed_in = float2fixed(num);
    const double fixed_root = fixed2float(sqrt(fixed_in));
    const double float_root = pow(num, 0.5);
    const double difference = float_root - fixed_root;

    std::cout << "input: " << num
              << ", fixed sqrt: " << fixed_root << " "
              << ", float sqrt: " << float_root
              << ", finderror: " << difference
              << std::endl;
}

// Entry point: reports the square-root error for a handful of sample inputs.
// C++ has no implicit int, so the return type is spelled out.
int main()
{
    finderror(0);
    finderror(1e-5);
    finderror(2e-5);
    finderror(3e-5);
    finderror(4e-5);
    finderror(5e-5);
    finderror(pow(2.0,1));
    finderror(1ULL<<35);
    return 0;
}

Answer 3

我不确定你是如何从表中显示的fixed::sqrt()获得数字的。

这就是我的所作所为：

#include <stdio.h>
#include <math.h>

// Map the `__int64` spelling used throughout this program onto `long long`.
#define __int64 long long // gcc doesn't know __int64
// Raw 64-bit fixed-point value; the macros below scale it by 2^FRACT.
typedef __int64 fixed;

// Number of fractional bits in a `fixed` value.
#define FRACT 28

// double -> fixed: scale by 2^FRACT; the cast to `fixed` drops any remaining fraction.
#define DBL2FIX(x) ((fixed)((double)(x) * (1LL << FRACT)))
// fixed -> double: divide the raw value by 2^FRACT.
#define FIX2DBL(x) ((double)(x) / (1LL << FRACT))

// De-++-ified code from
// http://www.justsoftwaresolutions.co.uk/news/optimizing-applications-with-fixed-point-arithmetic.html
//
// Square root of a fixed-point value with FRACT fractional bits, returned in
// the same format.  This is the article's algorithm, kept as-is so its output
// can be compared against sqrtfix1().
//
// The root is built up as a sum of powers of two: `a` is the root so far,
// `remainder` is what is left of the input after the accepted terms have been
// subtracted, and `b_shift` is the bit position of the next candidate term.
fixed sqrtfix0(fixed num)
{
    static unsigned const fixed_resolution_shift=FRACT;

    // Start from a_squared = 2^62 with `a` as its root in the scaled format.
    unsigned const max_shift=62;
    unsigned __int64 a_squared=1LL<<max_shift;
    unsigned b_shift=(max_shift+fixed_resolution_shift)/2;
    unsigned __int64 a=1LL<<b_shift;

    unsigned __int64 x=num;

    unsigned __int64 remainder;

    // Halve `a` (and quarter a_squared) until a_squared no longer exceeds x.
    while(b_shift && a_squared>x)
    {
        a>>=1;
        a_squared>>=2;
        --b_shift;
    }

    remainder=x-a_squared;
    // Next candidate is the bit just below the leading one.
    // NOTE(review): b_shift is unsigned, so this wraps if the loop above ran
    // it down to zero — confirm that case is only reached with remainder == 0.
    --b_shift;

    while(remainder && b_shift)
    {
        // b*b and 2*a*b for the candidate bit b, in the same scaled units as x.
        // NOTE(review): both operands are unsigned, so 2*b_shift-fixed_resolution_shift
        // wraps once 2*b_shift < fixed_resolution_shift and the shift count is
        // then out of range — confirm.
        unsigned __int64 b_squared=1LL<<(2*b_shift-fixed_resolution_shift);
        // Signed on purpose: selects between a left and a right shift of `a` below.
        int const two_a_b_shift=b_shift+1-fixed_resolution_shift;
        unsigned __int64 two_a_b=(two_a_b_shift>0)?(a<<two_a_b_shift):(a>>-two_a_b_shift);
        unsigned __int64 delta;

        // Step down to smaller candidate bits while the candidate is too large.
        while(b_shift && remainder<(b_squared+two_a_b))
        {
            b_squared>>=2;
            two_a_b>>=1;
            --b_shift;
        }
        delta=b_squared+two_a_b;
        // Accept the bit when the remainder is more than half of delta.
        // NOTE(review): this admits remainder < delta, in which case the
        // unsigned subtraction below wraps — confirm this is intended.
        if((2*remainder)>delta)
        {
            a+=(1LL<<b_shift);
            remainder-=delta;
            if(b_shift)
            {
                --b_shift;
            }
        }
    }
    return (fixed)a;
}

// Adapted code from
// http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Digit-by-digit_calculation
//
// Square root of a fixed-point value with 28 fractional bits, returned in the
// same format.  The argument is first shifted left by an even number of bits
// to gain significant digits, its integer square root is taken, and the
// result is shifted back into place.
fixed sqrtfix1(fixed num)
{
    fixed root = 0;
    fixed probe;
    int up = 0;      /* left shifts applied while normalising */
    int adjust;      /* final correction shift for the root */

    /* Scale num up until it reaches the second-to-top bit. */
    while (num != 0 && num < ((fixed)1 << 62))
    {
        num <<= 1;
        up++;
    }

    /* Only an even shift can be undone exactly on the root, so give back
       one bit when the count came out odd. */
    if (up % 2 != 0)
    {
        num >>= 1;
        up--;
    }

    /* The root of a value with 28 fractional bits needs 14 more bits; the
       normalisation already contributed up/2 of them. */
    adjust = 14 - up / 2;

    /* Digit-by-digit integer square root, one result bit per pass. */
    for (probe = (fixed)1 << 62; probe != 0; probe >>= 2)
    {
        const fixed trial = root + probe;

        root >>= 1;
        if (num >= trial)
        {
            num -= trial;
            root += probe;
        }
    }

    return (adjust >= 0) ? (root << adjust) : (root >> -adjust);
}

/*
 * Test driver: for each sample value, prints the results of sqrtfix0() and
 * sqrtfix1() next to the library sqrt() together with their absolute errors.
 */
int main(void)
{
    double testData[] =
    {
        0,
        1e-005,
        2e-005,
        3e-005,
        4e-005,
        5e-005,
        6e-005,
        7e-005,
        8e-005,
    };
    /* The element count is unsigned; the index uses the same type so the
       loop condition does not compare signed with unsigned. */
    const size_t count = sizeof(testData) / sizeof(testData[0]);
    size_t i;

    for (i = 0; i < count; i++)
    {
        double x = testData[i];
        fixed xf = DBL2FIX(x);

        fixed sqf0 = sqrtfix0(xf);
        fixed sqf1 = sqrtfix1(xf);

        double sq0 = FIX2DBL(sqf0);
        double sq1 = FIX2DBL(sqf1);

        printf("%10.8f:  "
               "sqrtfix0()=%10.8f / err=%e  "
               "sqrt()=%10.8f  "
               "sqrtfix1()=%10.8f / err=%e\n",
               x,
               sq0, fabs(sq0 - sqrt(x)),
               sqrt(x),
               sq1, fabs(sq1 - sqrt(x)));
    }

    printf("sizeof(double)=%d\n", (int)sizeof(double));

    return 0;
}

这就是我得到的（使用gcc和Open Watcom）：

0.00000000:  sqrtfix0()=0.00003052 / err=3.051758e-05  sqrt()=0.00000000  sqrtfix1()=0.00000000 / err=0.000000e+00
0.00001000:  sqrtfix0()=0.00311279 / err=4.948469e-05  sqrt()=0.00316228  sqrtfix1()=0.00316207 / err=2.102766e-07
0.00002000:  sqrtfix0()=0.00445557 / err=1.656955e-05  sqrt()=0.00447214  sqrtfix1()=0.00447184 / err=2.974807e-07
0.00003000:  sqrtfix0()=0.00543213 / err=4.509667e-05  sqrt()=0.00547723  sqrtfix1()=0.00547720 / err=2.438148e-08
0.00004000:  sqrtfix0()=0.00628662 / err=3.793423e-05  sqrt()=0.00632456  sqrtfix1()=0.00632443 / err=1.262553e-07
0.00005000:  sqrtfix0()=0.00701904 / err=5.202484e-05  sqrt()=0.00707107  sqrtfix1()=0.00707086 / err=2.060551e-07
0.00006000:  sqrtfix0()=0.00772095 / err=2.501943e-05  sqrt()=0.00774597  sqrtfix1()=0.00774593 / err=3.390476e-08
0.00007000:  sqrtfix0()=0.00836182 / err=4.783859e-06  sqrt()=0.00836660  sqrtfix1()=0.00836649 / err=1.086198e-07
0.00008000:  sqrtfix0()=0.00894165 / err=2.621519e-06  sqrt()=0.00894427  sqrtfix1()=0.00894409 / err=1.777289e-07
sizeof(double)=8

修改

我忽略了上述sqrtfix1()对较大的参数效果不好这一事实。可以通过在参数后附加28个零位、本质上计算其精确的整数平方根来修复。这样做的代价是内部计算需要使用128位算术，但实现相当直接：

/*
 * Square root of a fixed-point value with 28 fractional bits, returned in the
 * same format.  The argument is extended with 28 zero bits and the integer
 * square root of that wider value is taken digit by digit.  Every quantity is
 * kept as a pair of 64-bit halves (`_hi`, `_lo`) forming one 128-bit number.
 */
fixed sqrtfix2(fixed num)
{
    const unsigned __int64 top_bit = 0x8000000000000000ULL;
    unsigned __int64 rad_lo, rad_hi;                 /* remaining radicand */
    unsigned __int64 root_lo = 0, root_hi = 0;       /* root under construction */
    unsigned __int64 probe_lo = 0, probe_hi = (unsigned __int64)1 << 26;

    /* 128-bit radicand = num with 28 zero bits appended. */
    rad_lo = num << 28;
    rad_hi = num >> (64 - 28);

    while ((probe_lo | probe_hi) != 0)
    {
        unsigned __int64 diff_lo, diff_hi;
        int fits;

        /* diff = root + probe, carrying from the low half into the high half. */
        diff_lo = root_lo + probe_lo;
        diff_hi = root_hi + probe_hi + (diff_lo < root_lo);

        /* diff = radicand - diff, borrowing from the high half. */
        diff_hi = rad_hi - diff_hi - (rad_lo < diff_lo);
        diff_lo = rad_lo - diff_lo;

        /* A set top bit means the subtraction went negative. */
        fits = (diff_hi & top_bit) == 0;
        if (fits)
        {
            rad_lo = diff_lo;
            rad_hi = diff_hi;
        }

        /* root >>= 1 across both halves (done whether or not the probe fit). */
        root_lo = (root_lo >> 1) | (root_hi << 63);
        root_hi >>= 1;

        if (fits)
        {
            /* root += probe, with carry. */
            root_hi += probe_hi + (root_lo + probe_lo < root_lo);
            root_lo += probe_lo;
        }

        /* probe >>= 2 across both halves. */
        probe_lo = (probe_lo >> 2) | (probe_hi << 62);
        probe_hi >>= 2;
    }

    return root_lo;
}

它给出了与this answer几乎相同的结果（对于3.43597e+10略好一些）：

0.00000000: sqrtfix0()=0.00003052 / err=3.051758e-05 sqrt()=0.00000000 sqrtfix2()=0.00000000 / err=0.000000e+00
0.00001000: sqrtfix0()=0.00311279 / err=4.948469e-05 sqrt()=0.00316228 sqrtfix2()=0.00316207 / err=2.102766e-07
0.00002000: sqrtfix0()=0.00445557 / err=1.656955e-05 sqrt()=0.00447214 sqrtfix2()=0.00447184 / err=2.974807e-07
0.00003000: sqrtfix0()=0.00543213 / err=4.509667e-05 sqrt()=0.00547723 sqrtfix2()=0.00547720 / err=2.438148e-08
0.00004000: sqrtfix0()=0.00628662 / err=3.793423e-05 sqrt()=0.00632456 sqrtfix2()=0.00632443 / err=1.262553e-07
0.00005000: sqrtfix0()=0.00701904 / err=5.202484e-05 sqrt()=0.00707107 sqrtfix2()=0.00707086 / err=2.060551e-07
0.00006000: sqrtfix0()=0.00772095 / err=2.501943e-05 sqrt()=0.00774597 sqrtfix2()=0.00774593 / err=3.390476e-08
0.00007000: sqrtfix0()=0.00836182 / err=4.783859e-06 sqrt()=0.00836660 sqrtfix2()=0.00836649 / err=1.086198e-07
0.00008000: sqrtfix0()=0.00894165 / err=2.621519e-06 sqrt()=0.00894427 sqrtfix2()=0.00894409 / err=1.777289e-07
2.00000000: sqrtfix0()=1.41419983 / err=1.373327e-05 sqrt()=1.41421356 sqrtfix2()=1.41421356 / err=1.851493e-09
34359700000.00000000: sqrtfix0()=185363.69654846 / err=5.097361e-06 sqrt()=185363.69655356 sqrtfix2()=185363.69655356 / err=1.164153e-09

Answer 4

许多年前，我为我们公司制造的一款小型计算机编写了一个演示程序。该计算机有一条内置的平方根指令，我们编写了一个简单的程序，在TTY上演示计算机进行16位加/减/乘/除/平方根运算。唉，结果发现平方根指令中存在严重错误，但我们已经承诺要演示该功能。因此，我们创建了一个存放1到255各数平方的数组，然后通过简单查找将输入的值与数组中的某个值匹配，其下标就是平方根。

如何提高小值的定点平方根

4 个答案: