F#:按重复的元素序列进行分组

时间:2016-07-15 11:58:36

标签: f# seq

我有一系列对(键,值),如

[("a", 1), ("a", 2), ("a", 111), ("b", 3), ("bb", 1), ("bb", -1), ...]

将它转换为如下形式的序列,最有效的方法是什么:
[("a", [1,2,111]), ("b", [3]), ("bb", [1,-1])] 

或类似?

序列具有以下属性:它非常大(> 2Gb)

这使得 Seq.groupBy 效率极低,而且结果也不正确,有没有其他方法可以做到这一点?

P.S.:下面这个序列:

[("a", 1), ("a", 2), ("a", 111), ("bb", 1), ("bb", -1), ("a", 5), ("a", 6), ...]

应转换为

[("a", [1,2,111]), ("bb", [1,-1]), ("a", [5,6]), ...]

-

编辑#1:修正了不正确的样本

编辑#2:序列很大,所以懒惰(或最快)解决方案是首选

4 个答案:

答案 0 :(得分:3)

如果你希望可以选择得到惰性求值的结果,那么我认为不保持可变状态就没有优雅的实现方式。下面是一个相对直接、使用可变状态的实现:你需要保存上一个键,以及与之对应的所有值:

let s = [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5); ("a", 6)]
// Groups consecutive pairs of `s` that share a key. A key that shows up
// again after a different key starts a new group.
let s2 = 
    [
        // Key of the run being collected; None until the first pair is seen.
        let mutable currentKey = None
        // Values collected so far for currentKey, in input order.
        let pending = System.Collections.Generic.List<_>()
        for (key, value) in s do
            match currentKey with
            | Some k when k = key -> ()
            | Some k ->
                // The key changed: emit the finished run and start a new one.
                yield (k, List.ofSeq pending)
                pending.Clear()
                currentKey <- Some key
            | None ->
                currentKey <- Some key
            pending.Add value
        // Emit the trailing run (absent only when the input was empty).
        match currentKey with
        | None -> ()
        | Some k -> yield (k, List.ofSeq pending)
    ]

这给出了:

val s2 : (string * int list) list =
  [("a", [1; 2; 111]); ("bb", [1; -1]); ("a", [5; 6])]

对于延迟评估,请将[ ... ]替换为seq { ... }

答案 1 :(得分:2)

一种简单的递归方法,没有可变状态。

// Groups adjacent (key, value) pairs of the list `inseq`.
// `accumelem` is the key of the run in progress (None at the start) and
// `accumlist` holds that run's values, most recent first; values are
// therefore returned in reverse order of appearance within each group.
// The recursion in the "key changed" case is not a tail call.
let rec chunk inseq (accumelem,accumlist) = 
    match accumelem, inseq with
    | None, [] -> []
    | Some key, [] -> [ (key, accumlist) ]
    | None, (k, v) :: rest -> chunk rest (Some k, [ v ])
    | Some key, (k, v) :: rest when key = k -> chunk rest (accumelem, v :: accumlist)
    | Some key, (k, v) :: rest -> (key, accumlist) :: chunk rest (Some k, [ v ])


chunk [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5);("a", 6)] (None,[])

val it : (string * int list) list =
     [("a", [111; 2; 1]); ("bb", [-1; 1]); ("a", [6; 5])]

答案 2 :(得分:2)

这是一个递归解决方案:

let test = [("a", 1); ("a", 2); ("a", 111); ("bb", 1); ("bb", -1); ("a", 5); ("a", 6)]

// Groups adjacent (key, value) pairs of a list by key, preserving both the
// order of the groups and the order of the values inside each group.
// Returns [] for an empty input (the original raised on `alist.Head`).
let groupByAdjacentElements alist = 
    // a            - remaining input
    // groupAcc     - finished groups, most recent first
    // prevElement  - key of the run in progress
    // adjacentAcc  - values of the run in progress, most recent first
    let rec group a groupAcc prevElement adjacentAcc =
        match a with
        | [] -> 
            match adjacentAcc with
            | [] -> groupAcc
            | _ -> (prevElement, List.rev adjacentAcc)::groupAcc
        | (b, c)::tail -> 
            if b = prevElement then
                group tail groupAcc prevElement (c::adjacentAcc)
            else
                group tail ((prevElement, List.rev adjacentAcc)::groupAcc) b [c]

    match alist with
    | [] -> []
    | (firstKey, _) :: _ ->
        // Seeding with the first key means the first pair simply extends an
        // (initially empty) run instead of closing a bogus one.
        group alist [] firstKey []
        |> List.rev

let b = groupByAdjacentElements test

返回:[("a", [1; 2; 111]); ("bb", [1; -1]); ("a", [5; 6])]

如果您想进行延迟评估,则应考虑尝试LazyList

编辑:这是一个将ExtCore的LazyList与接受的解决方案进行比较的脚本。它会生成一个大文本文件,然后进行转换。请注意,LazyList以相反的顺序返回:

open System.Diagnostics
open System.IO
open ExtCore

// Generates the benchmark input: 20000*300 repetitions of the same eight
// "key,value" lines, written to Test.txt in the current directory.
let fileName = "Test.txt"
let outFile = new StreamWriter(fileName)
try
    // A plain range loop; `[1..n]` would first build a list of n integers
    // only to iterate over it.
    for _ in 1 .. 20000*300 do
        outFile.WriteLine("a,1")
        outFile.WriteLine("a,2")
        outFile.WriteLine("a,111")
        outFile.WriteLine("bb,1")
        outFile.WriteLine("bb,-1")
        outFile.WriteLine("a,5")
        outFile.WriteLine("a,6")
        outFile.WriteLine("c,8")
finally
    // Close (and flush) the writer even if one of the writes fails.
    outFile.Close()

printfn "Finished Writing to File"

// Sequence of (key, value) string pairs, one per "key,value" line of the file.
// Built from File.ReadLines and Seq.map, so lines are read and split when the
// sequence is enumerated, not when this binding is evaluated.
let data =
    let splitLine (line : string) =
        let fields = line.Split(',')
        (fields.[0], fields.[1])
    System.IO.File.ReadLines(fileName)
    |> Seq.map splitLine
printfn "Finished Reading File"

// Groups consecutive (key, value) pairs of `data` that share a key and
// returns the groups as a list, values in input order. A key that reappears
// after a different key starts a new group.
let s2 data = 
    [
        // Key of the run being collected; None until the first pair is seen.
        let mutable currentKey = None
        // Values collected so far for currentKey.
        let pending = System.Collections.Generic.List<_>()
        for (key, value) in data do
            match currentKey with
            | Some k when k = key -> ()
            | Some k ->
                // The key changed: emit the finished run and start a new one.
                yield (k, List.ofSeq pending)
                pending.Clear()
                currentKey <- Some key
            | None ->
                currentKey <- Some key
            pending.Add value
        // Emit the trailing run (absent only when the input was empty).
        match currentKey with
        | None -> ()
        | Some key -> yield (key, List.ofSeq pending)
    ]

// Groups adjacent (key, value) pairs of `aseq` by key, going through a
// LazyList. Each finished group is consed onto the front of the accumulator,
// so the groups come back in reverse input order; the values inside a group
// keep their input order.
// Returns an empty LazyList for an empty input (the original failed on
// `alist.Head()`).
let groupByAdjacentElements aseq = 
    // alist        - remaining input
    // groupAcc     - finished groups, most recent first
    // prevElement  - key of the run in progress
    // adjacentAcc  - values of the run in progress, most recent first
    let rec group alist groupAcc prevElement adjacentAcc =
        match alist with
        | Cons((b, c), tail) -> 
            if b = prevElement then
                group tail groupAcc prevElement (c::adjacentAcc)
            else
                group tail (LazyList.consDelayed (prevElement, List.rev adjacentAcc) (fun () -> groupAcc)) b [c]
        | Nil -> 
            match adjacentAcc with
            | [] -> groupAcc
            | _ -> LazyList.consDelayed (prevElement, List.rev adjacentAcc) (fun () -> groupAcc)

    match LazyList.ofSeq aseq with
    | Nil -> LazyList.empty
    | Cons((firstKey, _), _) as alist ->
        // Seed with the first key so the first pair extends an empty run.
        group alist LazyList.empty firstKey []

// Groups adjacent (key, value) pairs of `aseq` by key after copying the
// sequence into a list. Group order and value order both follow the input.
// Returns [] for an empty input (the original raised on `alist.Head`).
let groupByAdjacentElementsList aseq = 
    // a            - remaining input
    // groupAcc     - finished groups, most recent first
    // prevElement  - key of the run in progress
    // adjacentAcc  - values of the run in progress, most recent first
    let rec group a groupAcc prevElement adjacentAcc =
        match a with
        | [] -> 
            match adjacentAcc with
            | [] -> groupAcc
            | _ -> (prevElement, List.rev adjacentAcc)::groupAcc
        | (b, c)::tail -> 
            if b = prevElement then
                group tail groupAcc prevElement (c::adjacentAcc)
            else
                group tail ((prevElement, List.rev adjacentAcc)::groupAcc) b [c]

    match Seq.toList aseq with
    | [] -> []
    | (firstKey, _) :: _ as alist ->
        // Seed with the first key so the first pair extends an empty run.
        group alist [] firstKey []
        |> List.rev

[<EntryPoint>]
let main argv =
    // Runs one candidate, prints its result and the elapsed wall-clock time.
    // Printing the result stays inside the timed region, as before.
    let time run =
        let stopwatch = Stopwatch.StartNew()
        let b = run ()
        printfn "The result is: %A" b
        stopwatch.Stop()
        printfn "It took %A ms." stopwatch.ElapsedMilliseconds

    // Forces a full collection between runs so that garbage left by one
    // candidate is not charged to the next. GC.WaitForFullGCComplete only
    // reports the status of a registered GC notification and does not
    // collect anything by itself.
    let collect () =
        System.GC.Collect()
        System.GC.WaitForPendingFinalizers()

    time (fun () -> s2 data)
    collect ()
    time (fun () -> groupByAdjacentElements data)
    collect ()
    time (fun () -> groupByAdjacentElementsList data)
    0

当我使用大小约为 300MB 的文件时,LazyList 解决方案比 seq 解决方案略慢(94 秒对 83 秒)。不过,与序列解决方案不同,LazyList 的主要优势在于可以对它反复迭代。普通列表解决方案即使加上 List.rev 也比这两者都快(不使用 List.rev 时大约为 73 秒)。

答案 3 :(得分:0)

也可以在不使用可变绑定的情况下完成相邻键的分组。使用 Seq.scan,可以生成一个惰性序列,其中每个分块是立即求值的。Seq.scan 本身已经处理了一种特殊情况,即序列的第一个元素;通过把输入序列的每个元素包装为 option 并在末尾追加一个 None,我们可以处理另一种特殊情况(序列的结尾)。之后,我们跳过中间结果,并使用 Seq.choose 去掉状态。

为了获得最大的通用性,我建议采用类似 Seq.groupBy 的签名:
f:('T -> 'Key) -> xs:seq<'T> -> seq<'Key * 'T list> when 'Key : equality

其中,键投影函数作为第一个参数。

// Splits `xs` into runs of adjacent elements that share the key `f x`.
// The result is a lazy sequence of (key, elements-of-the-run) pairs; each
// run is an eagerly built list with the elements in input order.
let chunkBy (f : 'T-> 'Key) xs =
    // Scan state: (run completed by this step, if any;
    //              elements of the run in progress, most recent first;
    //              previous wrapped element).
    let step (_, pending, prior) next =
        match prior, next with
        | Some(priorKey, _), Some(nextKey, item) when priorKey = nextKey ->
            // Same key: nothing to emit, keep accumulating.
            None, item::pending, next
        | _ ->
            // Key changed, first element, or end marker: emit the run that
            // just ended (if there was one) and restart the accumulator
            // from the incoming element (if there is one).
            let finished = prior |> Option.map (fun (k, _) -> k, List.rev pending)
            let restarted = next |> Option.map snd |> Option.toList
            finished, restarted, next

    // Every element tagged with its key, followed by None as an end marker
    // so that the last run is flushed too.
    let terminated =
        seq {
            for x in xs do
                yield Some(f x, x)
            yield None
        }

    terminated
    |> Seq.scan step (None, [], None)
    |> Seq.choose (fun (emitted, _, _) -> emitted)

在此基础上再提供一个结果投影函数,就可以满足提问者的要求。

// Like chunkBy, but additionally maps every element of each run through the
// result projection `g`. Returns a function from the input sequence to the
// sequence of (key, projected elements) pairs.
let chunkBy2 (f : 'T-> 'Key) (g : 'T->'Result)  =
    let project (key, items) = key, items |> List.map g
    chunkBy f >> Seq.map project
// val chunkBy2 :
//   f:('T -> 'Key) -> g:('T -> 'Result) -> (seq<'T> -> seq<'Key * 'Result list>)
//      when 'Key : equality

// Finite input: adjacent pairs are grouped by their first component and
// only the second components are kept.
[ ("a", 1); ("a", 2); ("a", 111); ("b", 3); ("bb", 1); ("bb", -1) ]
|> chunkBy2 fst snd
// val it : seq<string * int list> =
//   seq [("a", [1; 2; 111]); ("b", [3]); ("bb", [1; -1])]

// Infinite input: the key alternates between "a" and "b" every two elements,
// so each chunk holds two values; skipping 50000 chunks lands on 100000.
Seq.initInfinite (fun n ->
    let key = if (n / 2) % 2 = 0 then "a" else "b"
    key, n)
|> chunkBy2 fst snd
|> Seq.skip 50000
// val it : seq<string * int list> =
//   seq
//     [("a", [100000; 100001]); ("b", [100002; 100003]); ("a", [100004; 100005]);
//      ("b", [100006; 100007]); ...]