如何将两个符号相反的浮点数相加?

时间:2019-10-14 22:46:58

标签: c algorithm floating-point

为了娱乐并弄清楚浮点数如何工作,我正在尝试制作一个函数,该函数接受两个单精度浮点数并将它们加在一起。

到目前为止,我写的代码对符号相同的数可以完美工作,但当两个数符号相反时,结果就不对了。我查看了许多问题和站点(UAF、How do you add 8-bit floating point with different signs、ICL、Adding 32 bit floating point numbers.、How to add and subtract 16 bit floating point half precision numbers?、How to subtract IEEE 754 numbers?),但是其中提到减法的,大多只是把它描述为“基本相同,只是改成做减法”,我觉得这没有什么帮助。UAF 确实提到:

  

负尾数的处理方法是先转换为2的补数,然后执行加法。执行加法后,结果将转换回符号幅度形式。

但我似乎不知道该怎么做。我找到了 this 和 this 这两篇资料,它们解释了什么是符号幅度(signed magnitude)表示,以及如何在它和二进制补码之间进行转换,所以我尝试像这样进行转换:

manz = manx + ( ( (many | 0x01000000) ^ 0x007FFFFF) + 1);

以及这样:

manz = manx + ( ( (many | 0x01000000) ^ 0x007FFFFF) + 1);
manz = ( ((manz - 1) ^ 0x007FFFFF) & 0xFEFFFFFF);

但是这些都不起作用。

尝试使用其他资料中描述的减法方法,我尝试以各种方式对负数的尾数求反:

manz = manx - many;
manz = manx + (many - (1<<23));
manz = manx + (many - (1<<24));
manz = manx + ( (many - (1<<23)) & 0x007FFFFF );
manz = manx + ( (many - (1<<23)) + 1);
manz = manx + ( (~many & 0x007FFFFF) + 1);
manz = manx + (~many + 1);
manz = manx + ( (many ^ 0x007FFFFF) + 1);
manz = manx + ( (many ^ 0x00FFFFFF) + 1);
manz = manx + ( (many ^ 0x003FFFFF) + 1);

这是应该根据符号处理加法的语句,它是在对齐尾数之后:

// First attempt at sign handling; per the surrounding text this runs after
// the mantissas have been aligned. The result starts out with y's exponent.
expz = expy;
if(signx != signy) { // opp sign
  if(manx < many) {
    // y has the larger mantissa, so the result takes y's sign.
    signz = signy;
    // XOR with 0x007FFFFF inverts only bits 0..22 of manx (bits 23 and up
    // are left untouched) before adding 1, and the result is ADDED to many.
    manz = many + ((manx ^ 0x007FFFFF) + 1);
  } else if(manx > many) {
    // x has the larger mantissa, so the result takes x's sign.
    signz = signx;
    // Same 23-bit invert-and-increment applied to many, but here the value
    // is SUBTRACTED from manx rather than added.
    manz = manx - ((many ^ 0x007FFFFF) + 1);
  } else { // x == y
    // Equal mantissas with opposite signs: all three result fields are zeroed.
    signz = 0x00000000;
    expz  = 0x00000000;
    manz  = 0x00000000;
  }
} else {
  // Same sign: keep the sign and add the mantissas directly.
  signz = signx;
  manz  = manx + many;
}

这是紧随其后的代码,用于在溢出的情况下对数字进行归一化,当它们具有相同的符号时可以使用,但是我不确定在减去时它的工作方式是否有意义:

// Post-add normalization: bit 24 set means the mantissa sum carried out
// one position to the left.
if(manz & 0x01000000) {
  expz++;
  // Halve the mantissa; the bit that is shifted out is added back in,
  // so the value is bumped by one whenever that bit was 1.
  manz = (manz >> 1) + (manz & 0x1);
}
// Keep only bits 0..22 of the mantissa.
manz &= 0x007FFFFF;

使用测试值 -3.34632F 和 34.8532413F 时,正确答案应为 0x41FC0E2D(31.506922),而我得到的是 0x427E0716(63.506920);使用测试值 3.34632F 和 -34.8532413F 时,正确答案应为 0xC1FC0E2D(-31.506922),而我得到的是 0xC27E0716(-63.506920)。


我能够通过更改减去时对浮点数进行规范化的方式来解决问题。

// Revised sign handling: subtraction and addition each get their own
// normalization step. The result starts out with y's exponent.
expz = expy;
if(signx != signy) { // opp sign
  // Subtract the smaller mantissa from the larger one; the result takes
  // the sign of the operand with the larger mantissa.
  if(manx < many) {
    signz = signy;
    manz  = many - manx;
  } else if(manx > many) {
    signz = signx;
    manz  = manx - many;
  } else { // x == y
    // Equal mantissas with opposite signs: all three result fields are zeroed.
    signz = 0x00000000;
    expz  = 0x00000000;
    manz  = 0x00000000;
  }
  // Normalize subtraction
  // Shift left until bit 23 is set, lowering the exponent once per shift.
  // The `&& manz` test skips the loop when the difference is zero.
  while((manz & 0x00800000) == 0 && manz) {
      manz <<= 1;
      expz--;
  }
} else {
  signz = signx;
  manz  = manx + many;
  // Normalize addition
  // Bit 24 set means the sum carried out one position to the left.
  if(manz & 0x01000000) {
    expz++;
    // NOTE(review): `x` is not defined in this fragment; the rounding test
    // presumably should inspect the low bits of manz (the value being
    // shifted) rather than x -- confirm against the full function.
    manz = (manz >> 1) + ( (x & 0x2) ? (x & 0x1) : 0 ); // round even
  }
}
// Keep only bits 0..22 of the mantissa.
manz &= 0x007FFFFF;

1 个答案:

答案 0 :(得分:0)

  

如何将两个符号相反的浮点数相加?

大多数情况下,你并不会这么做。

对于所有不能依赖“溢出时按二进制补码回绕”的数值类型(例如浮点数、大数库等),最终总会写成类似下面的形式:

add_signed(v1, v2) {
    if( v1 < 0) {
        if( v2 < 0) {
            // Both negative
            return -add_unsigned(-v1, -v2);
        } else {
            // Different sign, v1 is negative
            return subtract_unsigned(v2, -v1);
        }
    } else {
        if( v2 < 0) {
            // Different sign, v2 is negative
            return subtract_unsigned(v1, -v2);
        } else {
            // Both positive
            return add_unsigned(v1, v2);
        }
    }
 }

subtract_signed(v1, v2) {
    return add_signed(v1, -v2);
}

add_unsigned(v1, v2) {
    // Here we know that v1 and v2 will never be negative, and
    //   we know that the result will never be negative
    ...
}

subtract_unsigned(v1, v2) {
    if(v1 < v2) {
        return -subtract_unsigned(v2, v1);
    }
    // Here we know that v1 and v2 will never be negative, and
    //   we know that the result will never be negative
    ...
}

换句话说;所有实际的加法和所有实际的减法都使用无符号(“永不为负”)的数字进行。

下面是一个更完整的示例,仅实现加法的 32 位浮点仿真(C 语言;未经测试,可能有错误;对非规格化数(denormal)可能适用,也可能不适用;不支持 NaN 和无穷大;不支持上溢或下溢;没有“保留尾数以减少舍入前的精度损失”;除“向零舍入”外,不支持其他舍入模式):

#include <stdint.h>

/* Bit fields of a single-precision float held in a uint32_t. */
#define SIGN_FLAG      0x80000000U
#define EXPONENT_MASK  0x7F800000U
#define MANTISSA_MASK  0x007FFFFFU
#define IMPLIED_BIT    0x00800000U  /* leading 1 that is not stored in the mantissa field */
#define OVERFLOW_BIT   0x01000000U  /* carry out of a sum of two 24-bit mantissas */
#define EXPONENT_ONE   0x00800000U  /* adding this raises the exponent field by one */

/*
 * Software addition/subtraction of single-precision floats passed as raw
 * bit patterns.
 *
 * Limitations: bits shifted out during alignment are discarded (round
 * toward zero); NaN, infinities, exponent overflow/underflow and
 * denormals are not handled. A zero operand is handled explicitly.
 */

/* Prototypes: the functions below call each other before their definitions. */
uint32_t add_signed(uint32_t v1, uint32_t v2);
uint32_t subtract_signed(uint32_t v1, uint32_t v2);
uint32_t add_unsigned(uint32_t v1, uint32_t v2);
uint32_t subtract_unsigned(uint32_t v1, uint32_t v2);

/*
 * Shift mantissa `m` right so that a value with exponent field `exp_small`
 * is expressed at the larger exponent `exp_big`. Both exponents are still
 * in field position (masked with EXPONENT_MASK, not shifted down).
 */
static uint32_t align_mantissa(uint32_t m, uint32_t exp_small, uint32_t exp_big)
{
    uint32_t shift = (exp_big - exp_small) / EXPONENT_ONE;

    /* A 24-bit mantissa shifted by 24 or more has no bits left. */
    return (shift >= 24U) ? 0U : (m >> shift);
}

/*
 * Returns v1 + v2 for operands of any sign.
 * The work is dispatched on the two sign bits so that the helpers only
 * ever see magnitudes (sign bit clear).
 */
uint32_t add_signed(uint32_t v1, uint32_t v2) {
    if( (v1 & SIGN_FLAG) != 0) {
        if( (v2 & SIGN_FLAG) != 0) {
            // Both negative: add the magnitudes, result is negative
            return SIGN_FLAG | add_unsigned(v1 & ~SIGN_FLAG, v2 & ~SIGN_FLAG);
        } else {
            // Different sign, v1 is negative: v2 - |v1|
            return subtract_unsigned(v2, v1 & ~SIGN_FLAG);
        }
    } else {
        if( (v2 & SIGN_FLAG) != 0) {
            // Different sign, v2 is negative: v1 - |v2|
            return subtract_unsigned(v1, v2 & ~SIGN_FLAG);
        } else {
            // Both positive
            return add_unsigned(v1, v2);
        }
    }
}

/* Returns v1 - v2, computed as v1 + (-v2) by flipping v2's sign bit. */
uint32_t subtract_signed(uint32_t v1, uint32_t v2) {
    return add_signed(v1, v2 ^ SIGN_FLAG);
}

/*
 * Returns v1 + v2 where both operands have the sign bit clear.
 * The result also has the sign bit clear.
 */
uint32_t add_unsigned(uint32_t v1, uint32_t v2) {
    // Order the operands so that v1 >= v2. With the sign bit clear, comparing
    // the raw words compares exponents first and mantissas second.
    if(v1 < v2) {
        uint32_t tmp = v1;
        v1 = v2;
        v2 = tmp;
    }

    // Zero has no implied leading 1, so it must not go through the mantissa
    // path below.
    if(v2 == 0) {
        return v1;
    }

    // Here we know the exponent of v1 is not smaller than the exponent of v2

    uint32_t expr = v1 & EXPONENT_MASK;
    uint32_t m1 = (v1 & MANTISSA_MASK) | IMPLIED_BIT;
    uint32_t m2 = align_mantissa((v2 & MANTISSA_MASK) | IMPLIED_BIT,
                                 v2 & EXPONENT_MASK, expr);

    uint32_t mr = m1 + m2;
    if( (mr & OVERFLOW_BIT) != 0) {
        // The sum needs 25 bits: drop the lowest bit and raise the exponent.
        mr >>= 1;
        expr += EXPONENT_ONE;
    }
    return expr | (mr & ~IMPLIED_BIT);
}

/*
 * Returns v1 - v2 where both operands have the sign bit clear.
 * The result is negative (SIGN_FLAG set) when v1 < v2.
 */
uint32_t subtract_unsigned(uint32_t v1, uint32_t v2) {
    if(v1 == v2) {
        return 0;
    }
    if(v1 < v2) {
        // Swap so the larger magnitude comes first, then negate the result.
        return SIGN_FLAG ^ subtract_unsigned(v2, v1);
    }

    // Zero has no implied leading 1, so it must not go through the mantissa
    // path below.
    if(v2 == 0) {
        return v1;
    }

    // Here we know the exponent of v1 is not smaller than the exponent of v2,
    //  and that (if exponents are equal) the mantissa of v1 is larger
    //  than the mantissa of v2; and therefore the result will be
    //  positive

    uint32_t expr = v1 & EXPONENT_MASK;
    uint32_t m1 = (v1 & MANTISSA_MASK) | IMPLIED_BIT;
    uint32_t m2 = align_mantissa((v2 & MANTISSA_MASK) | IMPLIED_BIT,
                                 v2 & EXPONENT_MASK, expr);

    // mr cannot be zero here: with equal exponents the mantissas differ, and
    // with different exponents m2 has been shifted below IMPLIED_BIT.
    uint32_t mr = m1 - m2;
    while( (mr & IMPLIED_BIT) == 0) {
        // Cancellation cleared the leading bit: renormalize.
        mr <<= 1;
        expr -= EXPONENT_ONE;
    }
    return expr | (mr & ~IMPLIED_BIT);
}