
时间:2017-06-01 00:44:54

标签: asynchronous parallel-processing f#


/// Splits `xs` into consecutive chunks of at most `chunkSize` elements.
///
/// Element indices are grouped by `idx / chunkSize`, so every chunk except
/// possibly the last holds exactly `chunkSize` items and the original order
/// is kept inside each chunk. The result is a sequence of chunks, each of
/// which is itself a sequence.
///
/// Note: `xs.[idx]` on an F# list walks the list from the head, so building
/// all chunks costs more than a single pass over `xs`.
let chunkList chunkSize (xs : list<'T>) = 
    query {
        for idx in 0..(xs.Length - 1) do
        groupBy (idx / chunkSize) into g
        select (g |> Seq.map (fun idx -> xs.[idx]))
    }

/// Applies `foo` to every element of `xs`, running the applications as
/// parallel async computations, and returns the results as a list in the
/// same order as the inputs.
///
/// Each element is wrapped in its own `async` block; `Async.Parallel`
/// combines them and `Async.RunSynchronously` blocks until all have finished.
let par (foo: 'T -> 'S) (xs: list<'T>) = 
    xs
    |> List.map (fun x -> async { return foo x })
    |> Async.Parallel
    |> Async.RunSynchronously
    |> Array.toList

/// Maps `f` over `xs` chunk by chunk: the list is cut into chunks of
/// `chunkSize` elements, every chunk is mapped with `List.map f` through
/// `par`, and the per-chunk results are concatenated back in order.
let parChunks chunkSize (f: 'T -> 'S) (xs: list<'T>) =
    // Materialise every chunk as a list so `par` receives a list of lists.
    let chunks = [ for chunk in chunkList chunkSize xs -> List.ofSeq chunk ]
    let mappedChunks = par (List.map f) chunks
    List.concat mappedChunks


/// Benchmark workload: builds the list 1..1000000, maps every element `x`
/// to `sqrt (float (1000 * x + 1))`, and returns the first result.
/// The argument `i` is not used.
let g i =
    let roots = List.map (fun n -> sqrt (float (1000 * n + 1))) [1..1000000]
    List.head roots



List.map g [1..100] ;;   // Real:00:00:28.979,CPU:00:00:29.562


parChunks 50 g [1..100] ;;   // Real:00:00:23.027,CPU:00:00:24.687

但是,如果块大小等于列表大小的1/4,则性能几乎相同。我没想到这一点,因为我的处理器(Intel 6700HQ)有四个核心。


parChunks 25 g [1..100] ;;   // Real:00:00:21.695,CPU:00:00:24.437

查看Task Manager应用中的Performance选项卡,可以看到四个内核从未使用过。


3 个答案:

答案 0 :(得分:5)




在Array(数组)上使用Array.Parallel模块,而不是在List上操作,可以更轻松地实现您的目标。

/// Array-based version of the benchmark workload: maps every element `x`
/// of the array 1..1000000 to `sqrt (float (1000 * x + 1))` with
/// `Array.Parallel.map` and returns the first result.
/// The argument `i` is not used.
let g i = 
    [|1..1000000|]
    |> Array.Parallel.map (fun x -> sqrt (float (1000 * x + 1))) 
    |> Array.head


答案 1 :(得分:3)

在F#中,async工作流使用.NET的ThreadPool类(the .Net ThreadPool class)运行,该类具有GetMinThreads和GetMaxThreads方法(另见 https://stackoverflow.com/a/26041852/2314532)。它们使用两个out参数来返回允许线程池使用的最小或最大线程数,但在F#中会转换为返回元组的函数:

F# Interactive for F# 4.1
Freely distributed under the Apache 2.0 Open Source License

For help type #help;;

> open System.Threading ;;
> ThreadPool.GetMinThreads() ;;
val it : int * int = (4, 4)

> ThreadPool.GetMaxThreads() ;;
val it : int * int = (400, 200)

这两个数字分别用于“工作”线程和“异步I/O”线程。我的CPU有四个核心,因此池中两种线程的最小数量是4。我不确定这是否就是你的问题所在,但请尝试在你的系统上运行ThreadPool.GetMinThreads()并确认它是4。如果由于某种原因它是2,这可以解释为什么你没有获得更好的性能。

另请参阅这个回答(https://stackoverflow.com/a/26041852/2314532),了解使用async工作流进行并行处理可能导致的其他性能问题。这也可能就是这里发生的事情。


解决该问题的办法是对您计划并行处理的任何项目列表使用不同的数据结构(如RRB Tree):它旨在高效地拆分和连接(实际上拆分和连接都是O(1),尽管连接的常数因子相当大)。不幸的是,目前F#中没有RRB树的实现。我目前正在编写一个,估计大约再过一个月就能准备好。如果您想知道我何时发布我一直在处理的代码,可以订阅this GitHub issue。

答案 2 :(得分:2)


就性能而言,我们通常希望避免动态分配,因为我们不想浪费宝贵的周期来分配对象(在.NET中速度很快,在C/C++中很慢)或回收它们(非常慢)。





module SequentialFold =
  /// Sums `sqrt (1000. * v + 1.)` over every element `v` of `vs`, visiting
  /// the elements from left to right. Returns 0. for an empty array.
  let compute (vs : float []) : float =
    let mutable total = 0.
    for v in vs do
      total <- total + sqrt (1000. * v + 1.)
    total




open System
open System.Threading.Tasks

/// Millisecond clock: a stopwatch is started once, when `clock` is first
/// evaluated, and every call `clock ()` returns the milliseconds elapsed
/// since then.
let clock =
  let stopwatch = System.Diagnostics.Stopwatch.StartNew ()
  fun () -> stopwatch.ElapsedMilliseconds

/// Runs the action `a` once as a warm-up, forces a blocking generation-2
/// garbage collection, then runs `a` another `n` times while measuring.
///
/// Returns a 5-tuple: elapsed milliseconds for the `n` timed runs, the
/// number of gen-0, gen-1 and gen-2 collections that happened during those
/// runs, and the value produced by the warm-up call.
let timeIt n a = 
  // Warm-up call; its value is what the caller gets back.
  let warmUpResult = a ()

  // Start the measurement from a freshly collected heap.
  GC.Collect (2, GCCollectionMode.Forced, true)

  let gen0Before, gen1Before, gen2Before =
    GC.CollectionCount 0, GC.CollectionCount 1, GC.CollectionCount 2
  let startedAt = clock ()

  for _ in 1 .. n do
    ignore (a ())

  let finishedAt = clock ()
  let gen0After, gen1After, gen2After =
    GC.CollectionCount 0, GC.CollectionCount 1, GC.CollectionCount 2

  finishedAt - startedAt,
  gen0After - gen0Before,
  gen1After - gen1Before,
  gen2After - gen2Before,
  warmUpResult

// compute implemented using tail recursion
module TailRecursion =
  /// Sums `sqrt (1000. * v + 1.)` over every element `v` of `vs` with a
  /// tail-recursive loop. Returns 0. for an empty array.
  let compute (vs : float []) : float =
    // `s` is the running sum, `i` the index of the next element to add.
    let rec loop s i =
      if i < vs.Length then
        let v = vs.[i]
        loop (s + sqrt (1000. * v + 1.)) (i + 1)
      else
        // Past the last element: the accumulator is the final result.
        s
    loop 0. 0

// compute implemented using Array.fold
module SequentialFold =
  /// Folds over `vs` from left to right, adding `sqrt (1000. * v + 1.)`
  /// for each element `v` to a running sum that starts at 0.
  let compute (vs : float []) : float =
    let addRoot runningSum v = runningSum + sqrt (1000. * v + 1.)
    (0., vs) ||> Array.fold addRoot

// compute implemented using Array.map + Array.fold
module SequentialArray =
  /// First maps every element `v` of `vs` to `sqrt (1000. * v + 1.)` in a
  /// new array, then sums that array from left to right starting at 0.
  let compute (vs : float []) : float =
    let roots = Array.map (fun v -> sqrt (1000. * v + 1.)) vs
    Array.fold (+) 0. roots

// compute implemented using Array.Parallel.map + Array.fold
module ParallelArray =
  /// Maps every element `v` of `vs` to `sqrt (1000. * v + 1.)` with
  /// `Array.Parallel.map`, then sums the resulting array sequentially from
  /// left to right starting at 0.
  let compute (vs : float []) : float =
    let roots = vs |> Array.Parallel.map (fun v -> sqrt (1000. * v + 1.))
    (0., roots) ||> Array.fold (+)

// compute implemented using Parallel.For
module ParallelFor =
  /// Sums `sqrt (1000. * v + 1.)` over every element `v` of `vs` using
  /// `Parallel.For` with a per-worker partial sum.
  ///
  /// Each worker starts from `init ()`, `body` adds one element's term to the
  /// worker's partial sum, and `localFinally` adds the finished partial sum
  /// to the shared total under a lock. Because the order in which partial
  /// sums are combined is not fixed, the result may differ from a sequential
  /// sum in the last few floating-point digits.
  let compute (vs : float []) : float =
    let lockObj         = obj ()
    let mutable sum     = 0.
    let options         = ParallelOptions()
    let init ()         = 0.
    let body i _pls s   =
      // Read the element itself rather than converting the index, so the
      // result is correct for any input array.
      let v = vs.[i]
      s + sqrt (1000. * v + 1.)
    let localFinally ls =
      lock lockObj <| fun () -> sum <- sum + ls
    Parallel.For(
      0,
      vs.Length,
      options,
      Func<float>(init),
      Func<int, ParallelLoopState, float, float>(body),
      Action<float>(localFinally))
    |> ignore
    sum

// compute implemented using Parallel.For with batches of size 100
module ParallelForBatched =
  /// Sums `sqrt (1000. * v + 1.)` over every element `v` of `vs` using
  /// `Parallel.For` over batches of 100 consecutive elements.
  ///
  /// Each parallel iteration sums one batch with a tail-recursive loop on top
  /// of the worker's partial sum; `localFinally` adds every finished partial
  /// sum to the shared total under a lock. Because the order in which partial
  /// sums are combined is not fixed, the result may differ from a sequential
  /// sum in the last few floating-point digits.
  let compute (vs : float []) : float =
    let inner           = 100
    // Number of batches, rounded up so a trailing partial batch is included.
    let outer           = vs.Length / inner + (if vs.Length % inner = 0 then 0 else 1)
    let lockObj         = obj ()
    let mutable sum     = 0.
    let options         = ParallelOptions()
    let init ()         = 0.
    // Adds the terms for indices i .. e-1 to the running sum `s`.
    let rec loop e s i  =
      if i < e then
        let v = vs.[i]
        loop e (s + sqrt (1000. * v + 1.)) (i + 1)
      else
        s
    let body i _pls s   =
      let b = i * inner
      // Clamp the end of the last batch to the array length.
      let e = b + inner |> min vs.Length
      loop e s b
    let localFinally ls =
      lock lockObj <| fun () -> sum <- sum + ls
    Parallel.For(
      0,
      outer,
      options,
      Func<float>(init),
      Func<int, ParallelLoopState, float, float>(body),
      Action<float>(localFinally))
    |> ignore
    sum

/// Benchmark driver. For each `outer` value, builds an input array of
/// `count / outer` elements (element i holds `float i`), then runs every
/// `compute` variant `outer` times through `timeIt` and prints the elapsed
/// time, the GC collection counts, and the computed result.
///
/// `argv` is not used. Returns 0 as the process exit code.
[<EntryPoint>]
let main argv =
  let count   = 100000000
  // Repetition counts; these are the values that appear in the recorded
  // results ("Using outer = ...") printed after this program.
  let outers  =
    [|
      100000
      1000
      10
    |]

  for outer in outers do
    let inner     = count / outer
    let vs        = Array.init inner float
    let testCases = 
      [|
        "TailRecursion"         , fun ()  -> TailRecursion.compute      vs
        "Fold.Sequential"       , fun ()  -> SequentialFold.compute     vs
        "Array.Sequential"      , fun ()  -> SequentialArray.compute    vs
        "Array.Parallel"        , fun ()  -> ParallelArray.compute      vs
        "Parallel.For"          , fun ()  -> ParallelFor.compute        vs
        "Parallel.For.Batched"  , fun ()  -> ParallelForBatched.compute vs
      |]
    printfn "Using outer = %A, inner = %A, total is: %A" outer inner count
    for nm, a in testCases do
      printfn "  Running test case: %A" nm
      let tm, cc0, cc1, cc2, r = timeIt outer a
      printfn "   it took %A ms with GC collects (%A, %A, %A), result is: %A" tm cc0 cc1 cc2 r

  0

以下是结果(Intel I5,4核):

Using outer = 100000, inner = 1000, total is: 100000000
  Running test case: "TailRecursion"
   it took 389L ms with GC collects (0, 0, 0), result is: 666162.111
  Running test case: "Fold.Sequential"
   it took 388L ms with GC collects (0, 0, 0), result is: 666162.111
  Running test case: "Array.Sequential"
   it took 628L ms with GC collects (255, 0, 0), result is: 666162.111
  Running test case: "Array.Parallel"
   it took 993L ms with GC collects (306, 2, 0), result is: 666162.111
  Running test case: "Parallel.For"
   it took 711L ms with GC collects (54, 2, 0), result is: 666162.111
  Running test case: "Parallel.For.Batched"
   it took 490L ms with GC collects (52, 2, 0), result is: 666162.111
Using outer = 1000, inner = 100000, total is: 100000000
  Running test case: "TailRecursion"
   it took 389L ms with GC collects (0, 0, 0), result is: 666661671.1
  Running test case: "Fold.Sequential"
   it took 388L ms with GC collects (0, 0, 0), result is: 666661671.1
  Running test case: "Array.Sequential"
   it took 738L ms with GC collects (249, 249, 249), result is: 666661671.1
  Running test case: "Array.Parallel"
   it took 565L ms with GC collects (249, 249, 249), result is: 666661671.1
  Running test case: "Parallel.For"
   it took 157L ms with GC collects (0, 0, 0), result is: 666661671.1
  Running test case: "Parallel.For.Batched"
   it took 110L ms with GC collects (0, 0, 0), result is: 666661671.1
Using outer = 10, inner = 10000000, total is: 100000000
  Running test case: "TailRecursion"
   it took 387L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
  Running test case: "Fold.Sequential"
   it took 390L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
  Running test case: "Array.Sequential"
   it took 811L ms with GC collects (3, 3, 3), result is: 6.666666168e+11
  Running test case: "Array.Parallel"
   it took 567L ms with GC collects (4, 4, 4), result is: 6.666666168e+11
  Running test case: "Parallel.For"
   it took 151L ms with GC collects (0, 0, 0), result is: 6.666666168e+11
  Running test case: "Parallel.For.Batched"
   it took 102L ms with GC collects (0, 0, 0), result is: 6.666666168e+11







