Question

如果不使用第三方库，有没有更高效的方法来处理这个2DArray？

// F# Interactive directive that toggles timing output for the statements that follow.
#time
// Sample input: a matrix of floats with 2 rows and 4 columns.
let ar = array2D[[5.0; 6.0; 7.0; 8.0]; [1.0; 2.0; 3.0; 4.0]]

// NOTE(review): this bare range expression is not connected to the mapping below;
// presumably the asker meant to repeat the mapping this many times -- confirm.
[0..5000000]
// Builds a new matrix: each element becomes (value + 1.6) multiplied by
// (second index + 6) and by (first index + 7).
let a2 = ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(coli + 6) * double(rowi + 7))

Answer 1

如果你运行上面的代码，它需要大约0ms，所以这真的取决于你调用它的上下文。如果你只是循环运行它1M次，那么在我的机器上大约需要600ms：

// Runs the allocating map 1,000,001 times (both range bounds are inclusive).
// Every pass builds a brand-new result matrix which is then thrown away.
for pass = 0 to 1000000 do
  let mapped =
    Array2D.mapi
      (fun r c v -> (v + 1.6) * double ((c + 6) * (r + 7)))
      ar
  ignore mapped

这里，大部分时间都花在分配结果数组上 - 对于每次迭代，我们需要分配一个新的2D数组来存储结果。这为您提供了很好的功能属性（结果可以共享，因为它们没有发生变异），但这就是它需要更长时间的原因。

你可以使用一些变异并避免这种情况。这取决于上下文，所以这就是为什么你可能在这里找不到有用的答案。

例如，在这个仿真的1M循环示例中，我可以只分配一个数组来存储结果，然后重复写入：

// Destination matrix, allocated once outside the loop and overwritten on every pass.
// Mapping with `id` simply yields a matrix with the same dimensions as `ar`.
let res = ar |> Array2D.map id
for i in 0 .. 1000000 do
  for x in 0 .. ar.GetLength(0) - 1 do
    for y in 0 .. ar.GetLength(1) - 1 do
      // x is the first (row) index and y the second (column) index, so the
      // column offset is 6 and the row offset is 7 -- the same formula as the
      // Array2D.mapi version, which keeps both snippets producing equal results.
      res.[x, y] <- (ar.[x, y] + 1.6) * double ((y + 6) * (x + 7))

这大约需要100毫秒，由此可以看出分配的成本。但是，如果这样做可能会破坏你的程序，你就不应该做这个改变，因为现在你使用的是可变数组......

Answer 2

我对这个问题进行了一些测量，我认为这可能很有趣。

我创建了8个不同的测试用例并运行了3个不同大小的矩阵; 1000x1000,100x100和10x10。

此外，我还在x64和x86中运行了测试。

最后，我最终在两张图中显示了48个测试结果。 y轴是以毫秒为单位的执行时间。

创建零矩阵 - 创建零矩阵的成本
复制矩阵 - 使用Array2D.copy复制矩阵的成本
使用id映射矩阵 - 使用Array2D.map id复制矩阵的成本
原始算法 - OP发布的算法的成本
Tomas Petricek算法 - Tomas算法的成本
修改过的Tomas Petricek算法 - 使用Array.zeroCreate的修改算法的成本
反向算法 - 反向迭代矩阵的成本
翻转x，y算法 - 修改算法的成本，但翻转x，y迭代顺序

一些观察

Tomas想要证明与计算相比的副本成本，因此在他的例子中，副本不是内循环的一部分。我想要包含他的代码，所以我将副本移动到内部循环中以便能够与其他代码进行比较。修改后的Tomas算法是相同的代码，但使用Array2D.zeroCreate来创建新的矩阵。写这篇文章的时候，我意识到最好把它们都修改完毕。
在.NET 4.5.2上，x64在一般情况下做得更好
使用Array2D.zeroCreate并使用Array2D.copy
对于大矩阵，x，y迭代顺序非常重要。对于小矩阵，它并不重要。这是因为CPU缓存的工作原理
在数组上迭代逆序似乎给了一个小的好处。原因是检查y >= 0比y < xl更便宜。
反向算法必须使用尾递归，因为F＃for y = (yl - 1) downto 0使用y > variable_that_is_always_minus_1来检查循环结束。通过尾递归，我们可以强制y >= 0
对于较小尺寸的Matrix，创建它们的成本和GC的成本正在增加。

用于生成测量值的代码。

open System
open System.IO
open System.Diagnostics

/// Shared stopwatch that is already running once this value has been initialised.
/// Callers take two ElapsedMilliseconds readings and subtract them.
let clock = Stopwatch.StartNew ()

/// Number of garbage collections that have happened so far, summed over
/// generations 0, 1 and 2.
let collectionCount () =
  let mutable total = 0
  for generation = 0 to 2 do
    total <- total + GC.CollectionCount generation
  total

/// Runs `a` once untimed, then `outer` more times while measuring with the shared `clock`.
/// Prints the test name `n` and the elapsed time to stdout.
/// Returns the value of the first (untimed) call together with the elapsed milliseconds
/// of the `outer` timed calls.
let timeIt (n : string) (outer : int) (a : unit -> 'T) : 'T*int64 =
  printfn "Timing '%s'..." n

  // The first call happens before timing starts; its result is what gets returned.
  let firstResult = a ()

  let startedAt = clock.ElapsedMilliseconds
  for _run = 1 to outer do
    ignore (a ())
  let elapsed = clock.ElapsedMilliseconds - startedAt

  printfn "  took %d ms" elapsed

  firstResult, elapsed

/// Benchmark driver. For each of three square matrix sizes it times eight ways of
/// producing a matrix and writes one tab-separated row per test case to output.tsv
/// (dimensions, name, sum of the produced matrix, GC collection count, milliseconds).
/// Always returns exit code 0.
[<EntryPoint>]
let main argv =
  // Explicit seed for the random source that fills the input matrices.
  let random  = Random 19740531
  // Element-visits per test case: matrix size * number of repetitions stays constant.
  let total   = 100000000
  // Repetition counts; together with `total` they give 1000x1000, 100x100 and 10x10.
  let outers  = [|100;10000;1000000|]

  use output = new StreamWriter ".\output.tsv"
  "Dimensions\tName\tSum\tCollectionCounts\tMilliseconds" |> output.WriteLine

  for outer in outers do
    let inner = total / outer
    let dim   = inner |> float |> sqrt |> int32
    let ar    = Array2D.init dim dim (fun _ _ -> random.NextDouble ())

    printfn "New test run, matrix dimensions are %dx%d" dim dim 

    let run = sprintf "%d_%d" dim dim

    // Baseline: only allocates a zero-filled matrix of the same size.
    let perf_zero () : float[,] = 
      let xl = ar.GetLength(0)
      let yl = ar.GetLength(1)
      let res = Array2D.zeroCreate xl yl
      res

    // Baseline: copies the input with Array2D.copy.
    let perf_copy () : float[,] = 
      Array2D.copy ar

    // Baseline: copies the input by mapping every element with `id`.
    let perf_id () : float[,] = 
      ar |> Array2D.map id

    // The Array2D.mapi approach from the question. The offsets are (row + 6) and
    // (column + 7) so that it computes the same values as the loop-based variants
    // below; with the offsets swapped the Sum column could not be used to
    // cross-check the algorithms against each other.
    let perf_op () : float[,] = 
      ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(rowi + 6) * double(coli + 7))

    // Explicit loops writing into a destination created with `Array2D.map id`.
    let perf_tp () : float[,] =
      let res = ar |> Array2D.map id
      for x in 0 .. ar.GetLength(0) - 1 do
        for y in 0 .. ar.GetLength(1) - 1 do
          res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
      res

    // Same loops, but the destination comes from Array2D.zeroCreate.
    let perf_tpm () : float[,] =
      let xl = ar.GetLength(0)
      let yl = ar.GetLength(1)
      let res = Array2D.zeroCreate xl yl
      for x in 0 .. xl - 1 do
        for y in 0 .. yl - 1 do
          res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
      res

    // Same as perf_tpm with the loop nesting flipped: y is the outer loop, x the inner.
    let perf_tpmf () : float[,] =
      let xl = ar.GetLength(0)
      let yl = ar.GetLength(1)
      let res = Array2D.zeroCreate xl yl
      for y in 0 .. yl - 1 do
        for x in 0 .. xl - 1 do
          res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
      res

    // Same computation iterating both indices downwards, written as recursive
    // functions so that each loop ends on an explicit `>= 0` test.
    let perf_tr () : float[,] =
      let xl = ar.GetLength(0)
      let yl = ar.GetLength(1)
      let res = Array2D.zeroCreate xl yl
      let rec loopy x y =
        if y >= 0 then
          res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
          loopy x (y - 1)
        else
          ()
      and loopx x =
        if x >= 0 then
          loopy x (yl - 1)
          loopx (x - 1)
        else
          ()
      loopx (xl - 1)
      res


    let testCases =
      [|
        "Creating Zero Matrix"              , perf_zero
        "Copying Matrix"                    , perf_copy
        "Mapping Matrix with id"            , perf_id
        "Original Algorithm"                , perf_op
        "Tomas Petricek Algorithm"          , perf_tp 
        "Modified Tomas Petricek Algorithm" , perf_tpm
        "Reverse Algoritm"                  , perf_tr
        "Flipped x,y Algoritm"              , perf_tpmf
      |]

    for name, a in testCases do
      // Collection count is sampled around the whole test case, including the sum below.
      let pcc   = collectionCount ()
      let vs, t = timeIt name outer a
      // Sum of all elements of the produced matrix, written out as a result check.
      let sum   = ref 0.
      vs |> Array2D.iter (fun v -> sum := !sum + v)
      let dcc   = collectionCount () - pcc
      sprintf "%s\t%s\t%f\t%d\t%d" run name !sum dcc t |> output.WriteLine

  0

Answer 3

由于OP指出他的问题涉及像9x4这样的较小Matrix，我做了另一组指标。但是，由于我认为我以前的答案对尺寸较大的指标有一些有趣的观点，所以我决定创建一个新答案

我对这个问题进行了一些测量，我认为这可能很有趣。

我创建了9个不同的测试用例并在10x5矩阵上运行。所有测试都在Release（显然）/ x64中运行。

第一张图显示了以毫秒为单位的执行时间：

第二张图显示了测试运行期间GC集合的数量：

创建零矩阵 - 创建零矩阵的成本
复制矩阵 - 使用Array2D.copy复制矩阵的成本
使用id映射矩阵 - 使用Array2D.map id复制矩阵的成本
原始算法 - OP发布的算法的成本
具有零初始化的Tomas P算法 - 使用Array2D.zeroCreate的Tomas算法的成本
创建零固定大小矩阵 - 创建零固定大小矩阵的成本
复制固定大小矩阵 - 复制固定大小矩阵的成本
固定大小算法 - 适用于固定大小矩阵的OP算法的成本
固定大小更新程序 - 使用更新程序函数的OP算法的成本

固定大小矩阵是struct，它使用unsafe代码来避免GC分配。它是用C＃编写的，但可以移植到F＃。它不应被视为具有生产价值的代码，更像是对自己创作的灵感。

一些观察结果：

固定大小矩阵的复制速度很快
固定大小算法的表现不如预期的好。可能是因为JIT编译器不能很好地优化unsafe代码
固定大小更新程序（类似于Array2D.iteri）具有最佳性能
正如预期的那样，固定尺寸矩阵不会产生任何GC压力，因为它不依赖于GC分配。

如果固定大小矩阵是OP的可行路径，我很难判断，但这是一个值得考虑的选项。

F＃代码：

open System
open System.IO
open System.Diagnostics

open Unsafe

/// Shared stopwatch that is already running once this value has been initialised.
/// Callers take two ElapsedMilliseconds readings and subtract them.
let clock = Stopwatch.StartNew ()

/// Number of garbage collections that have happened so far, summed over
/// generations 0, 1 and 2.
let collectionCount () =
  let mutable total = 0
  for generation = 0 to 2 do
    total <- total + GC.CollectionCount generation
  total

/// Packages a named benchmark. Returns the name `n` together with a function that,
/// given a repetition count, runs `a` once to compute the reported result (through `r`),
/// then times `outer` further runs of `a` with the shared `clock`.
/// The returned function yields (result, elapsed milliseconds, GC collections during
/// the timed runs) and prints the same information to stdout.
let createTimer (n : string) (a : unit -> 'T) (r : 'T -> 'TResult) : string*(int -> 'TResult*int64*int) =
  n, fun outer ->
    printfn "Timing '%s'..." n

    // Untimed first run; `r` reduces its output to the value that is reported.
    let v = a () |> r

    // Settle the heap before measuring so that garbage left over from earlier test
    // cases is not counted against this one. GC.WaitForFullGCComplete is only
    // meaningful after registering for full-GC notifications, so the finalizer
    // queue is drained instead and whatever the finalizers released is collected.
    GC.Collect ()
    GC.WaitForPendingFinalizers ()
    GC.Collect ()

    let pcc   = collectionCount ()
    let t     = clock.ElapsedMilliseconds

    for i in 1..outer do
      a () |> ignore

    let e     = clock.ElapsedMilliseconds - t
    let dcc   = collectionCount () - pcc

    printfn "  took %d ms, collected %d times, result is %A" e dcc v

    v, e, dcc

// Benchmark driver for a small matrix (10 rows by 5 columns): times nine ways of
// producing a matrix, both with heap-allocated float[,] arrays and with the
// Matrix5x10 struct, and writes one tab-separated row per test case to output.tsv.
// Always returns exit code 0.
[<EntryPoint>]
let main argv =
  // Explicit seed for the random source that fills the input matrix.
  let random  = Random 19740531
  // Element-visits per test case; divided by the matrix size to get the repetition count.
  let total   = 300000000

  use output = new StreamWriter ".\output.tsv"
  "Name\tSum\tCollectionCounts\tMilliseconds" |> output.WriteLine

  let cols    = 5
  let rows    = 10  
  let inner   = cols*rows
  let outer   = total / inner
  let ar      = Array2D.init rows cols (fun _ _ -> random.NextDouble ())
  // Struct version of the same data. Note the index order: the array is addressed
  // as [row, col] while the struct indexer is addressed as [col, row].
  let mtx5x10 = 
    let mutable m = Matrix5x10 ()
    ar |> Array2D.iteri (fun row col v -> (m.[col, row] <- v))
    m

  printfn "New test run, matrix dimensions are %dx%d" cols rows

  // Baseline: only allocates a zero-filled array of the same size.
  let perf_zero () = 
    let xl = ar.GetLength(0)
    let yl = ar.GetLength(1)
    let res = Array2D.zeroCreate xl yl
    res

  // Baseline: copies the input with Array2D.copy.
  let perf_copy () = 
    Array2D.copy ar

  // Baseline: copies the input by mapping every element with `id`.
  let perf_id () = 
    ar |> Array2D.map id

  // The Array2D.mapi approach, with offsets (row + 6) and (column + 7).
  let perf_op () = 
    ar |> Array2D.mapi(fun rowi coli value -> (value + 1.6) * double(rowi + 6) * double(coli + 7))

  // Explicit loops writing into a destination created with Array2D.zeroCreate.
  let perf_tpm () =
    let xl = ar.GetLength(0)
    let yl = ar.GetLength(1)
    let res = Array2D.zeroCreate xl yl
    for x in 0 .. xl - 1 do
      for y in 0 .. yl - 1 do
        res.[x, y] <- (ar.[x, y] + 1.6) * double ((x + 6) * (y + 7))
    res

  // Baseline: creates a default-initialised Matrix5x10 struct.
  let perf_fzero () =
    let m = Matrix5x10()
    m

  // Baseline: binds the pre-filled struct to a new local and returns it.
  let perf_fcopy () =
    let m = mtx5x10
    m

  // Struct version of the algorithm using the indexer for every read and write.
  let perf_fs () =
    let mutable m = Matrix5x10 ()

    for row = 0 to Matrix5x10.Rows - 1 do
      for col = 0 to Matrix5x10.Columns - 1 do
        m.[col, row] <- (mtx5x10.[col, row] + 1.6) * double ((row + 6) * (col + 7))

    m

  // Updater delegate, created once here rather than inside perf_fsu.
  // Its arguments are (col, row, current value).
  let perf_fsui = Func<int, int, double, double> (fun col row v -> (v + 1.6) * double ((row + 6) * (col + 7)))

  // Struct version of the algorithm: starts from the pre-filled matrix in a mutable
  // local and lets Matrix5x10.Update apply the delegate to every element.
  let perf_fsu () =
    let mutable m = mtx5x10
    m.Update perf_fsui
    m

  // Result check for the array-based cases: sum of all elements.
  let sumArray vs =
    let sum   = ref 0.
    vs |> Array2D.iter (fun v -> sum := !sum + v)
    !sum

  // Result check for the struct-based cases: Update is used purely to visit every
  // element; the updater returns each value unchanged while accumulating the sum.
  let sumMatrix (mtx : Matrix5x10) =
    let sum   = ref 0.
    mtx.Update (fun _ _ v -> sum := !sum + v; v)
    !sum

  let testCases =
    [|
      createTimer "Creating Zero Matrix"              perf_zero   sumArray
      createTimer "Copying Matrix"                    perf_copy   sumArray
      createTimer "Mapping Matrix with id"            perf_id     sumArray
      createTimer "Original Algorithm"                perf_op     sumArray
      createTimer "Tomas P Algorithm with Zero Init"  perf_tpm    sumArray
      createTimer "Creating Zero Fixed Size Matrix"   perf_fzero  sumMatrix
      createTimer "Copying Fixed Size Matrix"         perf_fcopy  sumMatrix
      createTimer "Fixed Size Algorithm"              perf_fs     sumMatrix
      createTimer "Fixed Size Updater"                perf_fsu    sumMatrix
    |]

  // Each entry is (name, timer function); the timer is invoked with the repetition count.
  for name, a in testCases do
    let sum, t, dcc = a outer
    sprintf "%s\t%f\t%d\t%d" name sum dcc t |> output.WriteLine

  0

C＃代码（对于那些关心我用T4生成的代码）：

namespace Unsafe
{
  using System;
  using System.Diagnostics;
  using System.Runtime.CompilerServices;
  using System.Runtime.InteropServices;

  /// <summary>
  /// Fixed-size matrix of doubles with 5 columns and 10 rows, stored as 50 individual
  /// fields inside the struct itself. Fields are declared row by row, so element
  /// (column x, row y) is field number <c>Columns * y + x</c>.
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  public struct Matrix5x10
  {
    double m_c0_r0;
    double m_c1_r0;
    double m_c2_r0;
    double m_c3_r0;
    double m_c4_r0;
    double m_c0_r1;
    double m_c1_r1;
    double m_c2_r1;
    double m_c3_r1;
    double m_c4_r1;
    double m_c0_r2;
    double m_c1_r2;
    double m_c2_r2;
    double m_c3_r2;
    double m_c4_r2;
    double m_c0_r3;
    double m_c1_r3;
    double m_c2_r3;
    double m_c3_r3;
    double m_c4_r3;
    double m_c0_r4;
    double m_c1_r4;
    double m_c2_r4;
    double m_c3_r4;
    double m_c4_r4;
    double m_c0_r5;
    double m_c1_r5;
    double m_c2_r5;
    double m_c3_r5;
    double m_c4_r5;
    double m_c0_r6;
    double m_c1_r6;
    double m_c2_r6;
    double m_c3_r6;
    double m_c4_r6;
    double m_c0_r7;
    double m_c1_r7;
    double m_c2_r7;
    double m_c3_r7;
    double m_c4_r7;
    double m_c0_r8;
    double m_c1_r8;
    double m_c2_r8;
    double m_c3_r8;
    double m_c4_r8;
    double m_c0_r9;
    double m_c1_r9;
    double m_c2_r9;
    double m_c3_r9;
    double m_c4_r9;

    public const int Columns  = 5;
    public const int Rows     = 10;

    /// <summary>
    /// Gets or sets the element at column <paramref name="x"/>, row <paramref name="y"/>.
    /// </summary>
    /// <exception cref="IndexOutOfRangeException">
    /// <paramref name="x"/> is outside 0..4 or <paramref name="y"/> is outside 0..9.
    /// </exception>
    unsafe public double this[int x, int y]
    {

      [MethodImpl (MethodImplOptions.AggressiveInlining)]
      get
      {
        // Each coordinate is validated on its own: checking only the flattened
        // offset would let an out-of-range x wrap into a neighbouring row.
        if (x < 0 || x >= Columns || y < 0 || y >= Rows)
        {
          throw new IndexOutOfRangeException ("0 <= x < 5 && 0 <= y < 10");
        }

        var i = Columns * y + x;

        // Treats the fields as one contiguous block of doubles starting at the
        // first field; this relies on the sequential layout and declaration order.
        fixed (double * ms = &m_c0_r0)
        {
          return ms[i];
        }
      }

      [MethodImpl (MethodImplOptions.AggressiveInlining)]
      set
      {
        // Same per-coordinate validation as the getter.
        if (x < 0 || x >= Columns || y < 0 || y >= Rows)
        {
          throw new IndexOutOfRangeException ("0 <= x < 5 && 0 <= y < 10");
        }

        var i = Columns * y + x;

        fixed (double * ms = &m_c0_r0)
        {
          ms[i] = value;
        }
      }
    }

    /// <summary>
    /// Replaces every element with <c>updater (column, row, currentValue)</c>,
    /// visiting the elements row by row. Does nothing when
    /// <paramref name="updater"/> is null.
    /// </summary>
    public void Update (Func<int, int, double, double> updater)
    {
      if (updater == null)
      {
        return;
      }
      m_c0_r0 = updater (0, 0, m_c0_r0);
      m_c1_r0 = updater (1, 0, m_c1_r0);
      m_c2_r0 = updater (2, 0, m_c2_r0);
      m_c3_r0 = updater (3, 0, m_c3_r0);
      m_c4_r0 = updater (4, 0, m_c4_r0);
      m_c0_r1 = updater (0, 1, m_c0_r1);
      m_c1_r1 = updater (1, 1, m_c1_r1);
      m_c2_r1 = updater (2, 1, m_c2_r1);
      m_c3_r1 = updater (3, 1, m_c3_r1);
      m_c4_r1 = updater (4, 1, m_c4_r1);
      m_c0_r2 = updater (0, 2, m_c0_r2);
      m_c1_r2 = updater (1, 2, m_c1_r2);
      m_c2_r2 = updater (2, 2, m_c2_r2);
      m_c3_r2 = updater (3, 2, m_c3_r2);
      m_c4_r2 = updater (4, 2, m_c4_r2);
      m_c0_r3 = updater (0, 3, m_c0_r3);
      m_c1_r3 = updater (1, 3, m_c1_r3);
      m_c2_r3 = updater (2, 3, m_c2_r3);
      m_c3_r3 = updater (3, 3, m_c3_r3);
      m_c4_r3 = updater (4, 3, m_c4_r3);
      m_c0_r4 = updater (0, 4, m_c0_r4);
      m_c1_r4 = updater (1, 4, m_c1_r4);
      m_c2_r4 = updater (2, 4, m_c2_r4);
      m_c3_r4 = updater (3, 4, m_c3_r4);
      m_c4_r4 = updater (4, 4, m_c4_r4);
      m_c0_r5 = updater (0, 5, m_c0_r5);
      m_c1_r5 = updater (1, 5, m_c1_r5);
      m_c2_r5 = updater (2, 5, m_c2_r5);
      m_c3_r5 = updater (3, 5, m_c3_r5);
      m_c4_r5 = updater (4, 5, m_c4_r5);
      m_c0_r6 = updater (0, 6, m_c0_r6);
      m_c1_r6 = updater (1, 6, m_c1_r6);
      m_c2_r6 = updater (2, 6, m_c2_r6);
      m_c3_r6 = updater (3, 6, m_c3_r6);
      m_c4_r6 = updater (4, 6, m_c4_r6);
      m_c0_r7 = updater (0, 7, m_c0_r7);
      m_c1_r7 = updater (1, 7, m_c1_r7);
      m_c2_r7 = updater (2, 7, m_c2_r7);
      m_c3_r7 = updater (3, 7, m_c3_r7);
      m_c4_r7 = updater (4, 7, m_c4_r7);
      m_c0_r8 = updater (0, 8, m_c0_r8);
      m_c1_r8 = updater (1, 8, m_c1_r8);
      m_c2_r8 = updater (2, 8, m_c2_r8);
      m_c3_r8 = updater (3, 8, m_c3_r8);
      m_c4_r8 = updater (4, 8, m_c4_r8);
      m_c0_r9 = updater (0, 9, m_c0_r9);
      m_c1_r9 = updater (1, 9, m_c1_r9);
      m_c2_r9 = updater (2, 9, m_c2_r9);
      m_c3_r9 = updater (3, 9, m_c3_r9);
      m_c4_r9 = updater (4, 9, m_c4_r9);
    }

  }
}

映射2DArray时F＃性能更好 -> arraymodule.mapindexed

3 个答案: