对序列分组,直到键值发生变化

时间:2015-06-11 10:47:50

标签: excel f# grouping sequence type-providers

我有一个很大的Excel文件,在F#中通过Excel类型提供程序(Excel Provider)读取。这些行需要按某些列分组,但处理过程因OutOfMemoryException而崩溃。我不确定问题出在Seq.groupBy调用上,还是出在Excel类型提供程序上。为了简化问题,这里用一个三维点(Point)来代表一行。

/// A point in 3D space; the question uses it as a stand-in for one spreadsheet row.
type Point = { x: float; y: float; z: float }

/// Lazily enumerates every point of the 1000 x 1000 x 1000 integer grid,
/// with x varying slowest and z varying fastest.
let points =
    seq {
        for i in 1 .. 1000 do
            for j in 1 .. 1000 do
                for k in 1 .. 1000 do
                    yield { x = float i; y = float j; z = float k }
    }

/// Groups the points by their x coordinate.
let groups = Seq.groupBy (fun (p: Point) -> p.x) points

这些行已经按分组列排好序,例如先是10个x = 10的点,然后是20个x = 20的点,依此类推。我并不需要真正对它们分组,只需要把行切分成若干块,每当分组键发生变化就开始新的一块。有没有办法只枚举序列一次就得到这样的分块,而不是按某个列值或某个f(行)的值进行分组?

4 个答案:

答案 0 :(得分:2)

如果行已经排好序,则下面的chunkify函数会返回seq<'a list>。每个列表包含x值相同的所有点。

/// Splits `s` into runs of consecutive elements, enumerating `s` only once.
///
/// `pred prev next` is called with the most recently accepted element and the
/// incoming one; while it returns true the incoming element joins the current
/// run, otherwise the current run is emitted and a new one is started.
///
/// Each run is returned as a list in reverse input order (newest element
/// first), because elements are prepended while the run is being built.
/// An empty input produces an empty sequence.
let chunkify pred s =
    seq {
        // Declared inside the sequence expression so that every enumeration
        // starts from a fresh, empty accumulator.
        let mutable current = []

        for x in s do
            match current with
            | last :: _ when pred last x -> current <- x :: current
            | [] -> current <- [ x ]
            | finished ->
                yield finished
                current <- [ x ]

        // Emit the trailing run, but only if at least one element was seen;
        // otherwise an empty input would yield a single empty list.
        if not (List.isEmpty current) then
            yield current
    }

// Split the points into runs of consecutive elements sharing the same x.
let chunked =
    points
    |> chunkify (fun prev next -> prev.x = next.x)

这里chunked的类型为:

seq<Point list>

答案 1 :(得分:0)

似乎既没有一行就能写完的纯函数式解决方案,也没有现成的Seq函数——除非是我看漏了。

因此,作为替代方案,这里给出我自己的命令式解决方案。它与@Kevin的答案类似,但更符合我的需求。ref单元格中保存:

  • 组键,每行只计算一次
  • 当前块的列表(也可以是seq,以便与Seq.groupBy保持一致),其中按输入顺序保存f(x)等于已存储组键的元素(要求键支持相等比较)。

/// Splits `xs` into consecutive chunks of elements that map to the same key,
/// enumerating `xs` only once.
///
/// `f` computes the grouping key and is called exactly once per element.
/// A new chunk starts whenever the key differs from the key of the current
/// chunk, so the input is expected to be ordered by key already.
///
/// Returns a lazy sequence of `(key, elements)` pairs, with the elements of
/// each chunk in input order. An empty input produces an empty sequence.
let splitByChanged f xs =
    seq {
        // The accumulator lives inside the sequence expression so that a second
        // enumeration does not start from the leftovers of the first one.
        let mutable acc = None

        for x in xs do
            let key = f x

            match acc with
            | Some (current, chunk) when current = key ->
                // Prepend while building; the chunk is reversed once on emit.
                acc <- Some (current, x :: chunk)
            | Some (current, chunk) ->
                yield current, List.rev chunk
                acc <- Some (key, [ x ])
            | None -> acc <- Some (key, [ x ])

        // Emit whatever is still being accumulated when the input ends.
        match acc with
        | Some (current, chunk) -> yield current, List.rev chunk
        | None -> ()
    }
    // Example use: split `points` into chunks of consecutive elements with equal x.
    // NOTE(review): this call is indented to the same column as the body of
    // `splitByChanged` above, so as pasted it would be read as part of that
    // function; dedent it to column 0 to run it as a top-level statement.
    points |> splitByChanged (fun point -> point.x)

该函数具有以下签名:

    val splitByChanged :
      f:('a -> 'b) -> xs:seq<'a> -> seq<'b * 'a list> when 'b : equality

欢迎提供更正,甚至更好的解决方案

答案 2 :(得分:0)

另一种解决方案,思路与Kevin的相同:
module Seq =
    /// Splits `src` into arrays of consecutive elements that share the same
    /// key, enumerating `src` only once.
    ///
    /// `f` computes the key of an element. A new chunk starts whenever the key
    /// differs from the key of the previous element. Elements keep their input
    /// order inside each array. An empty input produces an empty sequence.
    let chunkBy f src =
        seq {
            let chunk = ResizeArray()
            // Placeholder only: it is never compared while `chunk` is empty.
            let mutable key = Unchecked.defaultof<_>

            for x in src do
                let newKey = f x

                if chunk.Count <> 0 && newKey <> key then
                    yield chunk.ToArray()
                    chunk.Clear()

                key <- newKey
                chunk.Add(x)

            // Without this the final run would be silently dropped, because a
            // chunk is otherwise only emitted when a later key change is seen.
            if chunk.Count <> 0 then
                yield chunk.ToArray()
        }

// Takes the first two chunks; each one is an array of 1000 points
points
|> Seq.chunkBy (fun p -> p.y)
|> Seq.take 2

下面是一种纯函数式的写法,它肯定更慢,也更难理解。

module Seq =
    /// Splits `src` into lists of consecutive elements that share the same key,
    /// without mutable state, by scanning the input.
    ///
    /// `f` computes the key of an element and is called once per element.
    /// A new chunk starts whenever the key differs from the key of the chunk
    /// being built. Each chunk is returned in reverse input order (newest
    /// element first), because elements are prepended while it is being built.
    /// An empty input produces an empty sequence.
    let chunkByFold f src =
        // Scan state: (chunk being built together with its key, chunk finished
        // by this step). `None` as an item marks the end of the input.
        let step (building, _) item =
            match building, item with
            | None, None -> None, None
            | None, Some x -> Some (f x, [ x ]), None
            | Some (_, chunk), None -> None, Some chunk
            | Some (key, chunk), Some x ->
                let newKey = f x

                if newKey = key then
                    Some (key, x :: chunk), None
                else
                    Some (newKey, [ x ]), Some chunk

        // The trailing `None` forces the last chunk out; without it that chunk
        // would never be emitted, since no later key change releases it.
        Seq.append (Seq.map Some src) (Seq.singleton None)
        |> Seq.scan step (None, None)
        |> Seq.choose snd

答案 3 :(得分:0)

让我们从输入开始:
/// Number of distinct values along each axis of the grid.
let count = 1000

/// A point in 3D space.
type Point = { x: float; y: float; z: float }

/// Lazily enumerates every point of the count x count x count integer grid,
/// with x varying slowest and z varying fastest.
let points =
    seq {
        for i in 1 .. count do
            for j in 1 .. count do
                for k in 1 .. count do
                    yield { x = float i; y = float j; z = float k }
    }

val count : int = 1000
type Point =
  {x: float;
   y: float;
   z: float;}
val points : seq<Point>

如果我们尝试对points完整求值,就会得到OutOfMemoryException:

// Materializes the whole sequence as a list; the transcript that follows shows
// this call failing with System.OutOfMemoryException.
points |> Seq.toList
System.OutOfMemoryException: Exception of type 'System.OutOfMemoryException' was thrown.
   at Microsoft.FSharp.Collections.FSharpList`1.Cons(T head, FSharpList`1 tail)
   at Microsoft.FSharp.Collections.SeqModule.ToList[T](IEnumerable`1 source)
   at <StartupCode$FSI_0011>.$FSI_0011.main@()
Stopped due to error

这可能就是groupBy失败的原因,但我不确定。不过它告诉我们,必须使用seq和yield来逐个返回分组。于是我们得到了这个实现:

/// Splits `points` into lists of consecutive elements, enumerating the input
/// only once.
///
/// `groupBy prev next` is called with the most recently accepted element and
/// the incoming one; while it returns true the incoming element joins the
/// current group, otherwise the current group is emitted and a new one starts.
///
/// Each group is returned in reverse input order (newest element first).
/// An empty input produces an empty sequence.
let group groupBy points =
    seq {
        // Kept inside the sequence expression so that enumerating the result
        // twice does not start from the leftovers of the first enumeration.
        let mutable lst = []

        for p in points do
            match lst with
            | p' :: _ when groupBy p' p -> lst <- p :: lst
            | [] -> lst <- [ p ]
            | finished ->
                yield finished
                lst <- [ p ]

        // Emit the trailing group; otherwise the last run of the input is lost.
        if not (List.isEmpty lst) then
            yield lst
    }

val group : groupBy:('a -> 'a -> bool) -> points:seq<'a> -> seq<'a list>

这不是最容易阅读的代码。它从点序列中逐个取出每个点,当满足groupBy函数时,把该点添加到累加器列表的头部。如果不满足groupBy函数,则新建一个累加器列表,并产出(yield)旧的累加器列表。请注意,累加器列表中元素的顺序是相反的。

测试功能:

// Print the x value and the size of every group of points sharing the same x.
points
|> group (fun p' p -> p'.x = p.x)
|> Seq.iter (fun g -> printfn "%f %i" g.[0].x g.Length)

可以正常结束(需要等待一段时间)。

下面是另一个实现,修复了错误并改进了格式。

/// Splits `points` into consecutive groups of elements that map to the same
/// key, enumerating the input only once.
///
/// `groupBy` computes the key of an element and is called once per element.
/// A new group starts whenever the key differs from the key of the group
/// being built, so the input is expected to be ordered by key already.
///
/// Returns a lazy sequence of `(key, elements)` pairs. The elements of each
/// group are in reverse input order (newest element first). An empty input
/// produces an empty sequence.
let group (groupBy : 'a -> 'b when 'b : equality) points =
    seq {
        // Kept inside the sequence expression so that enumerating the result
        // twice does not start from the leftovers of the first enumeration.
        let mutable current = None

        for p in points do
            let key = groupBy p

            match current with
            | Some (k, items) when k = key -> current <- Some (k, p :: items)
            | Some finished ->
                yield finished
                current <- Some (key, [ p ])
            | None -> current <- Some (key, [ p ])

        // Emit the trailing group only if there is one; taking the head of an
        // empty accumulator would throw on empty input.
        match current with
        | Some last -> yield last
        | None -> ()
    }