我正在尝试按照 BSON 规范序列化一个浮点数,而该规范只支持 64 位双精度浮点数,所以我需要把 float 转换(cast)为 double。
在 sizeof(double) == 8 的系统上,我只需要这样做:
// Case where double is already 64 bits wide: the float is cast to double
// and the result is handed to serialize().
float f = 3.14;
serialize((double)f);
但是由于我的目标系统上 sizeof(double) == 4,我必须执行类似下面的操作:
// Case where double is only 32 bits wide: the 64-bit result is carried in a
// uint64_t instead of a double.
// NOTE(review): float32_to_float64 is not shown here; presumably it writes
// the binary64 bit pattern of f into d -- confirm against its definition.
float f = 3.14;
uint64_t d;
float32_to_float64(f, &d);
serialize(d);
我(在一台 sizeof(double) == 8 的机器上)写了一些测试代码,试图正确地将 float32 转换为 float64,并把结果存储为 uint64_t,但我没有得到预期的结果。
#include <stdio.h>
#include <stdint.h>
// Mask applied to the float's raw bits to extract the fraction.
// NOTE(review): 0xFFFFFF selects the low 24 bits, while FLOAT_FRACTION_S
// below says the fraction is 23 bits, so this mask also covers bit 23.
#define FLOAT_FRACTION_MSK 0xFFFFFF
#define DOUBLE_FRACTION_S 52 // Fraction is 52 bits
#define DOUBLE_EXPONENT_S 11 // Exponent is 11 bits
#define FLOAT_FRACTION_S 23 // Fraction is 23 bits
#define FLOAT_EXPONENT_S 8 // Exponent is 8 bits
// Test program: builds a 64-bit pattern in bd from the bits of the float af
// by hand, then prints it next to the pattern the compiler produces for
// (double)af so the two can be compared.
int main(void) {
// float af = 3.14;
float af = 0.15625;
double bd = 0;
//uint8_t buff[sizeof(int64_t)] = {0};
// Move the float's sign bit (bit 31) up by 32 positions into bit 63 of bd.
// NOTE(review): the shift is evaluated in unsigned long; where that type is
// only 32 bits wide, shifting by 32 is undefined.
*(uint64_t*)&bd |= (*(uint32_t*)&af & (1UL << 31)) << 32; // check sign bit
// Isolate the exponent bits (0x7F800000) and shift them down.
// NOTE(review): the shift count is FLOAT_FRACTION_S+1 = 24, so the lowest
// bit of the 8-bit exponent field (bit 23) is dropped; for 0x3e200000 this
// yields 0x3e instead of the field value 0x7c.
uint8_t exponent32 = (*(uint32_t*)&af & 0x7F800000) >> (FLOAT_FRACTION_S+1);
// NOTE(review): after a shift by 24 the largest possible value is 0x7F, so
// this comparison with 0xFF can never be true.
if (exponent32 == 0xFF) return 1; // Error (infinity if fraction is zero,
// NaN otherwise)
printf("exponent32=%.4x\n", exponent32);
// Snapshot of bd's bits at this point (only the sign has been written).
int64_t temp = *(uint64_t*)&bd;
// OR the extracted exponent into bd, shifted left by DOUBLE_FRACTION_S+4 = 56.
// NOTE(review): the value is copied as-is; nothing here adjusts between the
// binary32 exponent bias (127) and the binary64 bias (1023), and the 11-bit
// binary64 exponent field starts at bit 52, not bit 56.
*(uint64_t*)&bd |= ((uint64_t)exponent32 << (DOUBLE_FRACTION_S+4)); //& 0x7FF0000000000000; // (33); // 28
printf("exponent64=%llx, %d\n", *(uint64_t*)&bd, (DOUBLE_FRACTION_S+4));
// Do the fraction
{
// Debug print of the fraction.
// NOTE(review): this print shifts by 52-23-4 = 25, whereas the value actually
// stored below is shifted by 52-23 = 29, so the printed number is not the
// one that ends up in bd.
printf("fraction64=%#.8llx\n", (
(uint64_t)(
(*(uint32_t*)&af & FLOAT_FRACTION_MSK) // + ((exponent32 != 0) ? (1<<24) : 0)
) << (DOUBLE_FRACTION_S-FLOAT_FRACTION_S-4)//((52-22)-1) // 33
) );
// OR the masked fraction into bd, left-aligned by 29 bits.
*(uint64_t*)&bd |= (
(uint64_t)(
(*(uint32_t*)&af & FLOAT_FRACTION_MSK) // + ((exponent32 != 0) ? (1<<24) : 0)
) << (DOUBLE_FRACTION_S-FLOAT_FRACTION_S)
) ;
}
// Reference value: let the compiler perform the float -> double conversion.
double expected = af;
// Print the raw bit patterns (hex) and then the values for comparison.
printf("Original float=%#.4x, converted double=%#.8llx expected=%.8llx,\n", *(uint32_t*)&af, *(uint64_t*)&bd, *(uint64_t*)&expected);
printf("Original float=%f, converted double=%lf\n\n", af, bd);
// Put the earlier snapshot back into bd; nothing reads bd after this.
*(uint64_t*)&bd = temp;
return 0;
}
这段代码的输出为:Original float=0x3e200000, converted double=0x3e04000000000000 expected=3fc4000000000000,
因此,在转换指数时我似乎遗漏了什么,但我不知道遗漏的是什么。
答案 0 :(得分:2)
已修正对非规格化数(denormal)、无穷大和 NaN 的处理:
/*
 * Float2Double - widen an IEEE-754 binary32 value to binary64 using integer
 * arithmetic only, for targets whose native double is not 64 bits wide.
 *
 * v:       the single-precision value to convert.
 * returns: the 64-bit pattern of the binary64 number equal to v
 *          (sign in bit 63, 11-bit exponent in bits 62..52, 52-bit fraction
 *          below). Zeros, subnormals, infinities and NaNs are handled; a NaN
 *          keeps its 23 fraction bits in the top of the wider fraction.
 *
 * unsigned long long is the standard spelling of the 64-bit type that the
 * MSVC-specific "unsigned __int64" names, so existing callers are unaffected.
 */
unsigned long long Float2Double(float v)
{
    /* Compile-time guard: the bit layout below needs float and unsigned int
       to have the same size; a negative array size stops the build if not. */
    typedef char float_and_unsigned_int_sizes_match[
        (sizeof(float) == sizeof(unsigned int)) ? 1 : -1];

    /* Read the float's bits through a union. Dereferencing a casted pointer
       (*(unsigned int *)&v) breaks the effective-type rule. */
    union {
        float        f;
        unsigned int u;
    } bits;
    bits.f = v;
    const unsigned int f = bits.u;

    /* +/-0.0: every bit except the sign is clear, so only the sign moves up. */
    if (!(f & 0x7fffffffu))
        return (unsigned long long)f << 32;

    const unsigned int s = f >> 31;              /* sign */
    unsigned int       e = (f >> 23) & 0xffu;    /* biased binary32 exponent */
    unsigned int       m = f & 0x007fffffu;      /* 23-bit fraction */

    if (e == 0xffu) {
        /* Infinity or NaN: all-ones exponent stays all-ones in binary64. */
        e = 0x7ffu;
    } else if (e == 0u) {
        /* Subnormal: the value is m * 2^-149. Every subnormal float is a
           normal double, so shift until the implicit leading 1 appears in
           bit 23, lowering the exponent once per shift. */
        e = 1023u - 127u + 1u;
        while (!(m & 0x00800000u)) {
            m <<= 1;
            e--;
        }
        m &= 0x007fffffu;                        /* drop the now-implicit 1 */
    } else {
        /* Normal number: swap the binary32 bias (127) for the binary64 one. */
        e += 1023u - 127u;
    }

    /* Assemble: the 23 fraction bits become the top bits of the 52-bit field,
       leaving the lower 29 bits zero. */
    return ((unsigned long long)s << 63)
         | ((unsigned long long)e << 52)
         | ((unsigned long long)m << (52 - 23));
}
答案 1 :(得分:1)
接受适用于所有float
的答案。
已使用所有 float 值成功测试,包括典型的规格化有限数、次正规数(subnormal)、+/-0、+/-无穷大和 NaN。
#include <assert.h>
#include <math.h>
#include <stdint.h>
/* binary32 (float) field layout */
#define F_SIGN_SHIFT (31)
#define F_EXPO_MAX (0xFF)
#define F_EXPO_SHIFT (23)
#define F_EXPO_MASK ((uint32_t) F_EXPO_MAX << F_EXPO_SHIFT)
#define F_EXPO_BIAS (127)
#define F_SFCT_MASK (0x7FFFFF)
#define F_SFCT_IMPLIEDBIT (F_SFCT_MASK + 1)
/* binary64 (double) field layout */
#define D_SIGN_SHIFT (63)
#define D_EXPO_MAX (0x7FF)
#define D_EXPO_SHIFT (52)
#define D_EXPO_MASK ((uint64_t) D_EXPO_MAX << D_EXPO_SHIFT)
#define D_EXPO_BIAS (1023)

/*
 * Convert a binary32 float into the 64-bit integer holding the binary64
 * encoding of the same value.
 *
 * f:       value to widen.
 * returns: sign, re-biased exponent and left-aligned significand packed into
 *          a uint64_t. Subnormal inputs are normalised; an all-ones exponent
 *          (infinity / NaN) maps to the all-ones binary64 exponent.
 */
uint64_t IEEEbinary32float_to_IEEEbinary64int(float f) {
  assert(sizeof f == sizeof(uint32_t));

  /* View the float's storage as a 32-bit integer. */
  union {
    float value;
    uint32_t bits;
  } pun = { f };
  const uint32_t raw = pun.bits;

  const uint64_t sign_part = (uint64_t) (raw >> F_SIGN_SHIFT) << D_SIGN_SHIFT;
  unsigned exp_field = (raw & F_EXPO_MASK) >> F_EXPO_SHIFT;
  uint32_t frac = raw & F_SFCT_MASK;

  if (exp_field == F_EXPO_MAX) {
    /* Infinity or NaN */
    exp_field = D_EXPO_MAX;
  } else if (exp_field != 0) {
    /* Ordinary normal number: only the bias changes */
    exp_field += D_EXPO_BIAS - F_EXPO_BIAS;
  } else if (frac != 0) {
    /* Subnormal: shift the leading 1 up to the implied-bit position,
       decrementing the exponent once per shift, then discard that bit. */
    exp_field = D_EXPO_BIAS - F_EXPO_BIAS + 1;
    do {
      frac <<= 1;
      exp_field--;
    } while ((frac & F_SFCT_IMPLIEDBIT) == 0);
    frac &= F_SFCT_MASK;
  }
  /* Otherwise the input is zero: exponent and significand both stay 0. */

  return sign_part
       | ((uint64_t) exp_field << D_EXPO_SHIFT)
       | ((uint64_t) frac << (D_EXPO_SHIFT - F_EXPO_SHIFT));
}