映射2DArray时F#性能更好 - > arraymodule.mapindexed

时间:2016-04-08 15:59:42

标签: performance f#

在不使用第三方库的情况下,处理这个2DArray有什么更高效的方法?

// Toggle F# Interactive timing so evaluated statements report their elapsed time.
#time
// 2x4 sample matrix of floats used as the input of the mapping below.
let ar = array2D[[5.0; 6.0; 7.0; 8.0]; [1.0; 2.0; 3.0; 4.0]]

// NOTE(review): this list is built and immediately discarded; it looks like the remnant of a
// loop header (repeating the mapping 5,000,001 times) -- confirm against the original post.
[0..5000000]
// Builds a new 2D array where each cell is (value + 1.6) * (coli + 6) * (rowi + 7);
// rowi/coli are the first/second index that Array2D.mapi passes to the function.
let a2 = ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(coli + 6) * double(rowi + 7))

3 个答案:

答案 0 :(得分:7)

如果你运行上面的代码,它需要大约0ms,所以这确实取决于你调用它的上下文。如果你只是在循环中运行它1M次,那么在我的机器上大约需要600ms:

// Repeat the mapping for every value in 0 .. 1000000 (inclusive); each pass allocates a
// fresh result array, which is then thrown away.
for pass in 0 .. 1000000 do
  // Per-cell transform: (value + 1.6) scaled by (column index + 6) * (row index + 7).
  let scaleCell rowi coli value =
    (value + 1.6) * double ((coli + 6) * (rowi + 7))
  ar |> Array2D.mapi scaleCell |> ignore

这里,大部分时间都花在分配结果数组上 - 对于每次迭代,我们需要分配一个新的2D数组来存储结果。这为您提供了很好的功能属性(结果可以共享,因为它们没有发生变异),但这就是它需要更长时间的原因。

你可以使用一些变异并避免这种情况。这取决于上下文,所以这就是为什么你可能在这里找不到有用的答案。

例如,在这个仿真的1M循环示例中,我可以只分配一个数组来存储结果,然后重复写入:

// Allocate the result buffer once, outside the repeated loop; `Array2D.map id` yields an
// array with the same dimensions as `ar`, and every cell is overwritten below.
let res = ar |> Array2D.map id
for i in 0 .. 1000000 do
  // x walks dimension 0 (rows) and y walks dimension 1 (columns).
  for x in 0 .. ar.GetLength(0) - 1 do
    for y in 0 .. ar.GetLength(1) - 1 do
      // Scale by (column + 6) * (row + 7) -- the same factors as the Array2D.mapi version
      // (coli + 6, rowi + 7) -- so the in-place variant computes the same values.
      res.[x, y] <- (ar.[x, y] + 1.6) * double ((y + 6) * (x + 7))

这大约需要100毫秒,因此可以让您了解分配的成本。但是,如果这个改动可能破坏你的程序,你就不应该这样做,因为现在你使用的是可变数组......

答案 1 :(得分:2)

我对这个问题进行了一些测量,我认为这可能很有趣。

我创建了8个不同的测试用例并运行了3个不同大小的矩阵; 1000x1000,100x100和10x10。

此外,我还在x64和x86中运行了测试。

最后,我最终在两张图中显示了48个测试结果。 y轴是以毫秒为单位的执行时间。

x64 performance run

x86 performance run

  1. 创建零矩阵 - 创建零矩阵的成本
  2. 复制矩阵 - 使用Array2D.copy复制矩阵的成本
  3. 使用id映射矩阵 - 使用Array2D.map id复制矩阵的成本
  4. 原始算法 - OP发布的算法的成本
  5. Tomas Petricek算法 - Tomas算法的成本
  6. 修改过的Tomas Petricek算法 - 使用Array2D.zeroCreate的修改算法的成本
  7. 反向算法 - 反向迭代矩阵的成本
  8. 翻转x,y算法 - 修改算法的成本,但翻转x,y迭代顺序

一些观察:

    1. Tomas想要证明与计算相比的副本成本,因此在他的例子中,副本不是内循环的一部分。我想要包含他的代码,所以我将副本移动到内部循环中以便能够与其他代码进行比较。修改后的Tomas算法是相同的代码,但使用Array2D.zeroCreate来创建新的矩阵。写这篇文章的时候,我意识到最好把它们都修改完毕。
    2. 在.NET 4.5.2上,x64在一般情况下做得更好
    3. 与使用Array2D.copy相比,使用Array2D.zeroCreate并填充矩阵会带来性能优势
    4. 对于大矩阵,x,y迭代顺序非常重要。对于小矩阵,它并不重要。这是因为CPU缓存的工作原理
    5. 以逆序迭代数组似乎带来了一点好处。原因是检查 y >= 0 比检查 y < xl 更便宜。
    6. 反向算法必须使用尾递归,因为F#的 for y = (yl - 1) downto 0 使用 y > variable_that_is_always_minus_1 来检查循环结束。通过尾递归,我们可以强制使用 y >= 0
    7. 对于较小尺寸的矩阵,创建它们的成本和GC的成本所占的比重越来越大。

用于生成测量值的代码:

      open System
      open System.IO
      open System.Diagnostics
      
      /// Shared stopwatch, created already running; callers measure durations as
      /// differences between two readings of its ElapsedMilliseconds.
      let clock = Stopwatch.StartNew ()
      
      /// Number of garbage collections that have occurred so far, summed over
      /// generations 0, 1 and 2.
      let collectionCount () =
        let mutable total = 0
        for generation in 0 .. 2 do
          total <- total + GC.CollectionCount generation
        total
      
      /// Times `outer` invocations of `a` using the shared `clock`.
      /// `n` is the label printed to the console. `a` is first invoked once outside the
      /// timed region; that first result is returned together with the elapsed
      /// milliseconds of the `outer` timed invocations.
      let timeIt (n : string) (outer : int) (a : unit -> 'T) : 'T*int64 =
        printfn "Timing '%s'..." n

        // Untimed first call: its value is what the caller gets back.
        let firstResult = a ()

        let startedAt = clock.ElapsedMilliseconds
        for pass in 1..outer do
          ignore (a ())
        let elapsed = clock.ElapsedMilliseconds - startedAt

        printfn "  took %d ms" elapsed

        firstResult, elapsed
      
      /// Benchmark entry point. For each outer iteration count it builds a square matrix of
      /// random doubles whose cell count is about `total / outer`, times every test case with
      /// `timeIt`, and writes one tab-separated row per test case to .\output.tsv.
      /// Always returns exit code 0.
      [<EntryPoint>]
      let main argv =
        // Fixed seed so every run benchmarks the same input data.
        let random  = Random 19740531
        let total   = 100000000
        let outers  = [|100;10000;1000000|]

        use output = new StreamWriter ".\output.tsv"
        "Dimensions\tName\tSum\tCollectionCounts\tMilliseconds" |> output.WriteLine

        for outer in outers do
          // Keep outer * (cells per matrix) roughly constant at `total`.
          let inner = total / outer
          let dim   = inner |> float |> sqrt |> int32
          let ar    = Array2D.init dim dim (fun _ _ -> random.NextDouble ())

          printfn "New test run, matrix dimensions are %dx%d" dim dim 

          let run = sprintf "%d_%d" dim dim

          // Allocation of a zero-filled matrix only.
          let perf_zero () : float[,] = 
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            res

          // Copy via Array2D.copy.
          let perf_copy () : float[,] = 
            Array2D.copy ar

          // Copy via Array2D.map with the identity function.
          let perf_id () : float[,] = 
            ar |> Array2D.map id

          // The algorithm from the question: (value + 1.6) * (column + 6) * (row + 7).
          let perf_op () : float[,] = 
            ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(coli + 6) * double(rowi + 7))

          // In the loops below x indexes dimension 0 (rows) and y indexes dimension 1
          // (columns). They use (y + 6) * (x + 7), i.e. (column + 6) * (row + 7), so that
          // every variant applies the same mapping as perf_op and the Sum column written to
          // the output is comparable across test cases.

          // Explicit loops writing into a buffer obtained from Array2D.map id.
          let perf_tp () : float[,] =
            let res = ar |> Array2D.map id
            for x in 0 .. ar.GetLength(0) - 1 do
              for y in 0 .. ar.GetLength(1) - 1 do
                res.[x, y] <- (ar.[x, y] + 1.6) * double ((y + 6) * (x + 7))
            res

          // Same loops, but the buffer comes from Array2D.zeroCreate.
          let perf_tpm () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            for x in 0 .. xl - 1 do
              for y in 0 .. yl - 1 do
                res.[x, y] <- (ar.[x, y] + 1.6) * double ((y + 6) * (x + 7))
            res

          // Same as perf_tpm with the loop nesting flipped: y is the outer loop.
          let perf_tpmf () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            for y in 0 .. yl - 1 do
              for x in 0 .. xl - 1 do
                res.[x, y] <- (ar.[x, y] + 1.6) * double ((y + 6) * (x + 7))
            res

          // Same as perf_tpm but iterating from the last index down to 0 with recursive
          // functions, so each loop ends on an `index >= 0` test.
          let perf_tr () : float[,] =
            let xl = ar.GetLength(0)
            let yl = ar.GetLength(1)
            let res = Array2D.zeroCreate xl yl
            let rec loopy x y =
              if y >= 0 then
                res.[x, y] <- (ar.[x, y] + 1.6) * double ((y + 6) * (x + 7))
                loopy x (y - 1)
              else
                ()
            and loopx x =
              if x >= 0 then
                loopy x (yl - 1)
                loopx (x - 1)
              else
                ()
            loopx (xl - 1)
            res


          let testCases =
            [|
              "Creating Zero Matrix"              , perf_zero
              "Copying Matrix"                    , perf_copy
              "Mapping Matrix with id"            , perf_id
              "Original Algorithm"                , perf_op
              "Tomas Petricek Algorithm"          , perf_tp 
              "Modified Tomas Petricek Algorithm" , perf_tpm
              "Reverse Algoritm"                  , perf_tr
              "Flipped x,y Algoritm"              , perf_tpmf
            |]

          for name, a in testCases do
            // Collection count is sampled before timing and again after the result has
            // been summed; the difference is reported for this test case.
            let pcc   = collectionCount ()
            let vs, t = timeIt name outer a
            // Sum of all cells of the returned matrix, written out as a sanity value.
            let sum   = ref 0.
            vs |> Array2D.iter (fun v -> sum := !sum + v)
            let dcc   = collectionCount () - pcc
            sprintf "%s\t%s\t%f\t%d\t%d" run name !sum dcc t |> output.WriteLine

        0
      

答案 2 :(得分:1)

由于OP指出他的问题涉及像9x4这样的较小Matrix,我做了另一组指标。但是,由于我认为我以前的答案对尺寸较大的指标有一些有趣的观点,所以我决定创建一个新答案

我对这个问题进行了一些测量,我认为这可能很有趣。

我创建了9个不同的测试用例并在10x5矩阵上运行。所有测试都在Release(显然)/ x64中运行。

第一张图显示了以毫秒为单位的执行时间:

Execution time in milliseconds

第二张图显示了测试运行期间GC集合的数量:

GC collection counts during test run

  1. 创建零矩阵 - 创建零矩阵的成本
  2. 复制矩阵 - 使用Array2D.copy复制矩阵的成本
  3. 使用id映射矩阵 - 使用Array2D.map id复制矩阵的成本
  4. 原始算法 - OP发布的算法的成本
  5. 使用零初始化的Tomas P算法 - 使用Array2D.zeroCreate的Tomas算法的成本
  6. 创建零固定大小矩阵 - 创建零固定大小矩阵的成本
  7. 复制固定大小矩阵 - 复制固定大小矩阵的成本
  8. 固定大小算法 - 适用于固定大小矩阵的OP算法的成本
  9. 固定大小更新程序 - 使用更新程序函数的OP算法的成本

固定大小矩阵是一个struct,它使用unsafe代码来避免GC分配。它是用C#编写的,但可以移植到F#。它不应被视为具有生产价值的代码,更像是供你自己创作时参考的灵感。

    一些观察结果:

    1. 复制固定大小矩阵非常快
    2. 固定大小算法的表现不如预期的好。可能是因为JIT编译器由于unsafe代码而必须执行一些额外的提升(lifting)工作
    3. 固定大小更新程序(类似于Array2D.iteri)具有最佳性能
    4. 正如预期的那样,固定大小矩阵不会产生任何GC压力,因为它不依赖于GC分配。
    5. 我很难判断固定大小矩阵对OP来说是否是一条可行的路径,但这是一个值得考虑的选项。

      F#代码:

      open System
      open System.IO
      open System.Diagnostics
      
      open Unsafe
      
      /// Shared stopwatch, created already running; callers measure durations as
      /// differences between two readings of its ElapsedMilliseconds.
      let clock = Stopwatch.StartNew ()
      
      /// Number of garbage collections that have occurred so far, summed over
      /// generations 0, 1 and 2.
      let collectionCount () =
        let mutable total = 0
        for generation in 0 .. 2 do
          total <- total + GC.CollectionCount generation
        total
      
      /// Builds a named benchmark. Returns `n` paired with a function that, given an
      /// iteration count, runs `a` that many times and returns
      /// (summary of one untimed run of `a` passed through `r`, elapsed milliseconds,
      /// number of garbage collections observed during the timed loop).
      let createTimer (n : string) (a : unit -> 'T) (r : 'T -> 'TResult) : string*(int -> 'TResult*int64*int) =
        n, fun outer ->
          printfn "Timing '%s'..." n

          // Untimed run; `r` reduces the result to the value that gets reported.
          let v = a () |> r

          // Settle the heap before measuring so garbage from the warm-up run is not
          // attributed to the timed loop. GC.WaitForFullGCComplete only reports the status
          // of a registered full-GC notification (none is registered here), so the usual
          // collect / wait for finalizers / collect sequence is used instead.
          GC.Collect ()
          GC.WaitForPendingFinalizers ()
          GC.Collect ()

          let pcc   = collectionCount ()
          let t     = clock.ElapsedMilliseconds

          for i in 1..outer do
            a () |> ignore

          let e     = clock.ElapsedMilliseconds - t
          let dcc   = collectionCount () - pcc

          printfn "  took %d ms, collected %d times, result is %A" e dcc v

          v, e, dcc
      
      /// Benchmark entry point for the small (10 rows x 5 columns) matrix. Builds the test
      /// cases with createTimer, runs each for `outer` iterations and writes one
      /// tab-separated row per test case to .\output.tsv. Always returns exit code 0.
      [<EntryPoint>]
      let main argv =
        // Fixed seed so every run benchmarks the same input data.
        let random  = Random 19740531
        let total   = 300000000
      
        use output = new StreamWriter ".\output.tsv"
        "Name\tSum\tCollectionCounts\tMilliseconds" |> output.WriteLine
      
        let cols    = 5
        let rows    = 10  
        let inner   = cols*rows
        // Iteration count chosen so that outer * (cells per matrix) equals `total`.
        let outer   = total / inner
        let ar      = Array2D.init rows cols (fun _ _ -> random.NextDouble ())
        // Fixed-size struct matrix filled with the same values as `ar`; note that its
        // indexer is addressed as [col, row] while `ar` is addressed as [row, col].
        let mtx5x10 = 
          let mutable m = Matrix5x10 ()
          ar |> Array2D.iteri (fun row col v -> (m.[col, row] <- v))
          m
      
        printfn "New test run, matrix dimensions are %dx%d" cols rows
      
        // Allocation of a zero-filled array only.
        let perf_zero () = 
          let xl = ar.GetLength(0)
          let yl = ar.GetLength(1)
          let res = Array2D.zeroCreate xl yl
          res
      
        // Copy via Array2D.copy.
        let perf_copy () = 
          Array2D.copy ar
      
        // Copy via Array2D.map with the identity function.
        let perf_id () = 
          ar |> Array2D.map id
      
        // Array2D.mapi version: (value + 1.6) * (row + 6) * (column + 7).
        let perf_op () = 
          ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(rowi + 6) * double(coli + 7))
      
        // Explicit loops into a zeroCreate buffer; x is the row index, y the column index.
        let perf_tpm () =
          let xl = ar.GetLength(0)
          let yl = ar.GetLength(1)
          let res = Array2D.zeroCreate xl yl
          for x in 0 .. xl - 1 do
            for y in 0 .. yl - 1 do
              res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
          res
      
        // Creation of a default (all-zero) Matrix5x10 struct value.
        let perf_fzero () =
          let m = Matrix5x10()
          m
      
        // Matrix5x10 is a struct, so binding it to a new local and returning it copies the value.
        let perf_fcopy () =
          let m = mtx5x10
          m
      
        // Same mapping as perf_tpm, reading from and writing to Matrix5x10 through its indexer.
        let perf_fs () =
          let mutable m = Matrix5x10 ()
      
          for row = 0 to Matrix5x10.Rows - 1 do
            for col = 0 to Matrix5x10.Columns - 1 do
              m.[col, row] <- (mtx5x10.[col, row] + 1.6) * double ((row + 6) * (col + 7))
      
          m
      
        // Delegate created once, outside the timed function, and reused by perf_fsu.
        let perf_fsui = Func<int, int, double, double> (fun col row v -> (v + 1.6) * double ((row + 6) * (col + 7)))
      
        // Copies mtx5x10 into a mutable local and transforms the copy in place via Update.
        let perf_fsu () =
          let mutable m = mtx5x10
          m.Update perf_fsui
          m
      
        // Sum of all cells of a 2D array; reported as a sanity value.
        let sumArray vs =
          let sum   = ref 0.
          vs |> Array2D.iter (fun v -> sum := !sum + v)
          !sum
      
        // Sum of all cells of a Matrix5x10: Update visits every cell and the callback
        // returns each value unchanged while accumulating it.
        let sumMatrix (mtx : Matrix5x10) =
          let sum   = ref 0.
          mtx.Update (fun _ _ v -> sum := !sum + v; v)
          !sum
      
        let testCases =
          [|
            createTimer "Creating Zero Matrix"              perf_zero   sumArray
            createTimer "Copying Matrix"                    perf_copy   sumArray
            createTimer "Mapping Matrix with id"            perf_id     sumArray
            createTimer "Original Algorithm"                perf_op     sumArray
            createTimer "Tomas P Algorithm with Zero Init"  perf_tpm    sumArray
            createTimer "Creating Zero Fixed Size Matrix"   perf_fzero  sumMatrix
            createTimer "Copying Fixed Size Matrix"         perf_fcopy  sumMatrix
            createTimer "Fixed Size Algorithm"              perf_fs     sumMatrix
            createTimer "Fixed Size Updater"                perf_fsu    sumMatrix
          |]
      
        // Each entry is (name, runner); the runner returns (sum, elapsed ms, GC count).
        for name, a in testCases do
          let sum, t, dcc = a outer
          sprintf "%s\t%f\t%d\t%d" name sum dcc t |> output.WriteLine
      
        0
      

      C#代码(供感兴趣的人参考;这段代码是我用T4生成的):

      namespace Unsafe
      {
        using System;
        using System.Diagnostics;
        using System.Runtime.CompilerServices;
        using System.Runtime.InteropServices;

        /// <summary>
        /// Fixed-size matrix of doubles with 5 columns and 10 rows, stored as 50 sequential
        /// fields inside a struct. Cell (x, y) -- x is the column, y is the row -- is the
        /// field at position 5 * y + x in declaration order.
        /// </summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct Matrix5x10
        {
          // Field order matters: the indexer addresses cells by offset from m_c0_r0.
          double m_c0_r0;
          double m_c1_r0;
          double m_c2_r0;
          double m_c3_r0;
          double m_c4_r0;
          double m_c0_r1;
          double m_c1_r1;
          double m_c2_r1;
          double m_c3_r1;
          double m_c4_r1;
          double m_c0_r2;
          double m_c1_r2;
          double m_c2_r2;
          double m_c3_r2;
          double m_c4_r2;
          double m_c0_r3;
          double m_c1_r3;
          double m_c2_r3;
          double m_c3_r3;
          double m_c4_r3;
          double m_c0_r4;
          double m_c1_r4;
          double m_c2_r4;
          double m_c3_r4;
          double m_c4_r4;
          double m_c0_r5;
          double m_c1_r5;
          double m_c2_r5;
          double m_c3_r5;
          double m_c4_r5;
          double m_c0_r6;
          double m_c1_r6;
          double m_c2_r6;
          double m_c3_r6;
          double m_c4_r6;
          double m_c0_r7;
          double m_c1_r7;
          double m_c2_r7;
          double m_c3_r7;
          double m_c4_r7;
          double m_c0_r8;
          double m_c1_r8;
          double m_c2_r8;
          double m_c3_r8;
          double m_c4_r8;
          double m_c0_r9;
          double m_c1_r9;
          double m_c2_r9;
          double m_c3_r9;
          double m_c4_r9;

          public const int Columns  = 5;
          public const int Rows     = 10;

          /// <summary>
          /// Gets or sets the cell at column <paramref name="x"/>, row <paramref name="y"/>.
          /// </summary>
          /// <exception cref="IndexOutOfRangeException">
          /// <paramref name="x"/> is outside 0..Columns-1 or <paramref name="y"/> is outside
          /// 0..Rows-1.
          /// </exception>
          unsafe public double this[int x, int y]
          {

            [MethodImpl (MethodImplOptions.AggressiveInlining)]
            get
            {
              // Validate each coordinate on its own: checking only the flattened index
              // would accept e.g. (7, 0) or (-1, 1) and silently alias another cell.
              if (x < 0 || x >= Columns || y < 0 || y >= Rows)
              {
                throw new IndexOutOfRangeException ("0 <= x < 5 && 0 <= y < 10");
              }

              var i = Columns * y + x;

              fixed (double * ms = &m_c0_r0)
              {
                return ms[i];
              }
            }

            [MethodImpl (MethodImplOptions.AggressiveInlining)]
            set
            {
              // Same per-coordinate validation as the getter.
              if (x < 0 || x >= Columns || y < 0 || y >= Rows)
              {
                throw new IndexOutOfRangeException ("0 <= x < 5 && 0 <= y < 10");
              }

              var i = Columns * y + x;

              fixed (double * ms = &m_c0_r0)
              {
                ms[i] = value;
              }
            }
          }

          /// <summary>
          /// Replaces every cell with <c>updater (column, row, currentValue)</c>, visiting
          /// the cells row by row. Does nothing when <paramref name="updater"/> is null.
          /// </summary>
          public void Update (Func<int, int, double, double> updater)
          {
            if (updater == null)
            {
              return;
            }
            m_c0_r0 = updater (0, 0, m_c0_r0);
            m_c1_r0 = updater (1, 0, m_c1_r0);
            m_c2_r0 = updater (2, 0, m_c2_r0);
            m_c3_r0 = updater (3, 0, m_c3_r0);
            m_c4_r0 = updater (4, 0, m_c4_r0);
            m_c0_r1 = updater (0, 1, m_c0_r1);
            m_c1_r1 = updater (1, 1, m_c1_r1);
            m_c2_r1 = updater (2, 1, m_c2_r1);
            m_c3_r1 = updater (3, 1, m_c3_r1);
            m_c4_r1 = updater (4, 1, m_c4_r1);
            m_c0_r2 = updater (0, 2, m_c0_r2);
            m_c1_r2 = updater (1, 2, m_c1_r2);
            m_c2_r2 = updater (2, 2, m_c2_r2);
            m_c3_r2 = updater (3, 2, m_c3_r2);
            m_c4_r2 = updater (4, 2, m_c4_r2);
            m_c0_r3 = updater (0, 3, m_c0_r3);
            m_c1_r3 = updater (1, 3, m_c1_r3);
            m_c2_r3 = updater (2, 3, m_c2_r3);
            m_c3_r3 = updater (3, 3, m_c3_r3);
            m_c4_r3 = updater (4, 3, m_c4_r3);
            m_c0_r4 = updater (0, 4, m_c0_r4);
            m_c1_r4 = updater (1, 4, m_c1_r4);
            m_c2_r4 = updater (2, 4, m_c2_r4);
            m_c3_r4 = updater (3, 4, m_c3_r4);
            m_c4_r4 = updater (4, 4, m_c4_r4);
            m_c0_r5 = updater (0, 5, m_c0_r5);
            m_c1_r5 = updater (1, 5, m_c1_r5);
            m_c2_r5 = updater (2, 5, m_c2_r5);
            m_c3_r5 = updater (3, 5, m_c3_r5);
            m_c4_r5 = updater (4, 5, m_c4_r5);
            m_c0_r6 = updater (0, 6, m_c0_r6);
            m_c1_r6 = updater (1, 6, m_c1_r6);
            m_c2_r6 = updater (2, 6, m_c2_r6);
            m_c3_r6 = updater (3, 6, m_c3_r6);
            m_c4_r6 = updater (4, 6, m_c4_r6);
            m_c0_r7 = updater (0, 7, m_c0_r7);
            m_c1_r7 = updater (1, 7, m_c1_r7);
            m_c2_r7 = updater (2, 7, m_c2_r7);
            m_c3_r7 = updater (3, 7, m_c3_r7);
            m_c4_r7 = updater (4, 7, m_c4_r7);
            m_c0_r8 = updater (0, 8, m_c0_r8);
            m_c1_r8 = updater (1, 8, m_c1_r8);
            m_c2_r8 = updater (2, 8, m_c2_r8);
            m_c3_r8 = updater (3, 8, m_c3_r8);
            m_c4_r8 = updater (4, 8, m_c4_r8);
            m_c0_r9 = updater (0, 9, m_c0_r9);
            m_c1_r9 = updater (1, 9, m_c1_r9);
            m_c2_r9 = updater (2, 9, m_c2_r9);
            m_c3_r9 = updater (3, 9, m_c3_r9);
            m_c4_r9 = updater (4, 9, m_c4_r9);
          }

        }
      }