Question

我希望将以下 C 代码转换为 F#（这是快速平方根倒数算法）：

/*
 * Fast inverse square root: returns an approximation of 1 / sqrt(number).
 *
 * The float's bit pattern is reinterpreted as an integer, halved and
 * subtracted from the constant 0x5f3759df to obtain a first guess, which is
 * then refined with one Newton-Raphson step.
 *
 * The bits are reinterpreted through a union rather than through
 * `*(long *)&y`: `long` is 64 bits wide on LP64 platforms, so the pointer
 * cast would read past the 4-byte float, and it also violates the
 * strict-aliasing rules. This version assumes `int` and `float` are both
 * 32 bits wide.
 */
float Q_rsqrt( float number )
{
    union { float f; int i; } conv;
    float x2;

    x2 = number * 0.5F;
    conv.f = number;                         // Store the float ...
    conv.i = 0x5f3759df - ( conv.i >> 1 );   // ... and work on its bit pattern
    // conv.f now holds the initial guess; refine it once.
    return conv.f * ( 1.5F - ( x2 * conv.f * conv.f ) );
}

Answer 1

首先你应该自己做一些研究。然后，如果遇到困难，请具体说明你的问题出在哪里。下面是 Kit Eason 给出的解决方案。

/// Approximates 1 / sqrt n: the bit pattern of `n` is read as an int32,
/// shifted right by one and subtracted from 0x5f3759df to form a first
/// guess, which is then refined by a single correction step.
let fastInvSqrt (n : float32) : float32 =
    let magic : int32 = 0x5f3759df
    let halfN = 0.5f * n
    // Reinterpret the float's bytes as a 32-bit integer.
    let rawBits = System.BitConverter.ToInt32(System.BitConverter.GetBytes(n), 0)
    let guessBits = magic - (rawBits >>> 1)
    // Reinterpret the adjusted integer as a float again.
    let guess = System.BitConverter.ToSingle(System.BitConverter.GetBytes(guessBits), 0)
    guess * (1.5f - (halfN * guess * guess))

// Usage: the approximation next to the exactly computed value.
let x = 4.0f |> fastInvSqrt
// F# Interactive prints: val x : float32 = 0.499153584f
let x' = 1.0 / sqrt 4.0
// F# Interactive prints: val x' : float = 0.5

Answer 2

在性能和底层优化方面，优化前后都进行测量通常是个好主意。快速平方根倒数技巧非常酷，但它得到的只是平方根倒数的近似值；问题在于如今是否真的还需要这样取巧的代码（在 DOOM 时代，浮点运算性能很差，这个技巧确实令人惊叹）。

无论如何，我搭建了一个简单的性能测试程序，用来比较朴素实现、Kit Eason / lad2025 提供的解决方案，以及另一个不分配字节数组的解决方案。

open System
open System.Diagnostics
open System.Runtime.InteropServices

// Constant from which the halved bit pattern is subtracted to form the initial guess.
[<Literal>]
let MAGIC_NUMBER = 0x5f3759df
// Coefficient used in the refinement step of the approximation.
[<Literal>]
let THREE_HALVES = 1.5F
// One half as a float32 literal.
[<Literal>]
let HALF = 0.5F

// Number of timed repetitions handed to timeIt for each test case.
[<Literal>]
let OUTER = 1000
// Upper bound of the summation loop executed inside every test case.
[<Literal>]
let INNER = 10000

// Straightforward reference implementation: 1 / sqrt x computed directly.
let inline invSqr (x : float32) : float32 =
  let root = sqrt x
  1.0f / root

// Fast inverse square root that round-trips through BitConverter.
// Each call allocates two byte arrays (one per GetBytes), which creates
// GC pressure and hurts performance.
let fInvSqr (x : float32) : float32 =
  let halfX = 0.5f * x
  // float32 -> bytes -> int32
  let rawBits = BitConverter.ToInt32(BitConverter.GetBytes(x), 0)
  let guessBits = MAGIC_NUMBER - (rawBits >>> 1)
  // int32 -> bytes -> float32
  let guess = BitConverter.ToSingle(BitConverter.GetBytes(guessBits), 0)
  guess * (THREE_HALVES - (halfX * guess * guess))

// Explicit-layout struct whose two fields both start at offset 0, so a value
// written through `f` can be read back as its raw bits through `i`, and
// vice versa.
// Susceptible to race conditions & endianness issues.
[<Struct; StructLayout (LayoutKind.Explicit)>]
type Bits =
  [<FieldOffset(0)>] val mutable f: float32
  [<FieldOffset(0)>] val mutable i: int32

// Single module-level instance; code that writes to it shares this state.
let mutable bits = Bits ()

// Fast inverse square root without byte-array allocation: the float is
// written into the module-level `bits` value and its bit pattern is read
// and rewritten through the overlapping int32 field.
let fInvSqr2 (x : float32) : float32 =
  let halfX = 0.5F * x
  bits.f <- x
  // Replace the stored pattern with the initial guess.
  bits.i <- MAGIC_NUMBER - (bits.i >>> 1)
  let guess = bits.f
  guess * (THREE_HALVES - (halfX * guess * guess))


// Invokes `a` once before timing starts (that result is the one returned),
// then measures how long `n` further invocations take.
// Returns (elapsed milliseconds, result of the first invocation).
let timeIt n (a : unit -> 'T) : int64 * 'T =
  let firstResult = a ()

  let stopwatch = Stopwatch ()
  stopwatch.Start ()

  for _ in 1 .. n do
    a () |> ignore

  stopwatch.Stop ()

  stopwatch.ElapsedMilliseconds, firstResult

// Entry point: runs every test case through timeIt and prints the elapsed
// time together with the sum the case produced. Always returns 0.
[<EntryPoint>]
let main argv =

  // Each case pairs a label with a thunk that sums the function under test
  // over the integers 1..INNER.
  let testCases =
    [|
      ("invSqr",
       fun () ->
         let mutable total = 0.F
         for k in 1 .. INNER do
           total <- total + invSqr (float32 k)
         total)
      ("fInvSqr",
       fun () ->
         let mutable total = 0.F
         for k in 1 .. INNER do
           total <- total + fInvSqr (float32 k)
         total)
      ("fInvSqr2",
       fun () ->
         let mutable total = 0.F
         for k in 1 .. INNER do
           total <- total + fInvSqr2 (float32 k)
         total)
    |]

  for (label, run) in testCases do
    printfn "Running %s %d times..." label (OUTER*INNER)
    let elapsedMs, total = timeIt OUTER run
    printfn "... it took %d ms product result: %f" elapsedMs total

  0

我机器上的性能测试结果：

Running invSqr 10000000 times...
... it took 78 ms product result: 198.544600
Running fInvSqr 10000000 times...
... it took 311 ms product result: 198.358200
Running fInvSqr2 10000000 times...
... it took 49 ms product result: 198.358200
Press any key to continue . . .

所以我们看到，fInvSqr 的耗时实际上约为朴素方案的 4 倍（311 ms 对 78 ms），很可能是因为字节数组的分配。此外，GC 的开销也隐藏在这些数字之中，可能带来额外的、非确定性的性能下降。

fInvSqr2 的表现似乎稍好一些，但它也有缺点：

结果偏差0.1％
Bits技巧易受竞争条件影响（可修复）
Bits 技巧容易受到字节序（endianness）问题的影响（如果在字节序不同的 CPU 上运行该程序，它可能会出错）

性能提升是否值得付出这些代价？由于程序不太可能只由平方根倒数运算构成，实际的整体性能增益可能要小得多。我很难想象如今会有哪种场景让我为了性能而选择快速平方根倒数技巧，但这一切都取决于你的具体情况。

将 C 代码转换为 F#

2 个答案: