我有一个很大的 Excel 文件,在 F# 中使用 Excel 类型提供程序(Excel Provider)读取。
这些行需要按某些列分组,但处理过程因 OutOfMemoryException 而崩溃。
我不确定问题出在 Seq.groupBy 调用上,
还是出在 Excel 类型提供程序上。
为了简化问题,这里用一个三维点(Point)来代表一行数据。
/// A 3D point; stands in for one row of the spreadsheet.
type Point =
    { x : float
      y : float
      z : float }

/// Lazily enumerates 1000 * 1000 * 1000 points, with z varying fastest,
/// then y, then x.
let points =
    seq {
        for x in 1 .. 1000 do
            for y in 1 .. 1000 do
                for z in 1 .. 1000 do
                    yield { x = float x; y = float y; z = float z }
    }
// Groups every point by its x coordinate using the built-in Seq.groupBy.
let groups = Seq.groupBy (fun (point : Point) -> point.x) points
行已按分组列排序,例如先是 10 个 x = 10 的点,然后是 20 个 x = 20 的点,依此类推。我并不需要真正对它们进行分组,只需在键值发生变化时把行切分成块。有没有办法只枚举序列一次就得到这样的分块,而不是按某个列值或某个 f(行) 的值进行分组?
答案 0 :(得分:2)
如果行已经排好序,则此 chunkify 函数将返回 seq<'a list>。每个列表将包含具有相同 x 值的所有点。
/// Splits `s` into lists of consecutive elements.
///
/// `pred previous current` is called with the most recently accumulated
/// element and the incoming one; while it returns true the incoming element
/// joins the current chunk, otherwise the current chunk is yielded and a new
/// one is started.
///
/// The input is enumerated once and only the current chunk is buffered.
/// Elements inside each yielded list are in reverse input order (newest
/// first), because they are consed onto the front of the accumulator.
/// An empty input yields no chunks.
let chunkify pred s =
    seq {
        // The accumulator lives inside the seq block, so every enumeration
        // of the result starts from a clean state.
        let current = ref []
        for x in s do
            match current.Value with
            | latest :: _ when pred latest x -> current.Value <- x :: current.Value
            | [] -> current.Value <- [ x ]
            | finished ->
                yield finished
                current.Value <- [ x ]
        // Flush the trailing chunk, but never emit an empty list for empty input.
        if not (List.isEmpty current.Value) then
            yield current.Value
    }
// Chunk the points into runs that share the same x coordinate.
let chunked = chunkify (fun (a : Point) (b : Point) -> a.x = b.x) points
这里 chunked 的类型为:
seq<Point list>
答案 1 :(得分:0)
似乎并不存在被我忽略的一行式纯函数解法,或者现成的 Seq 方法。
因此,作为替代方案,下面是我自己的命令式解法。它与 @Kevin 的答案相似,但更符合我的实际需求。ref 单元格包含:
当前组的键;
当前块的列表(可对照 Seq.groupBy),其中存放输入序列中 f(x)
等于已存储组键(要求键支持相等比较)的元素。
/// Splits `xs` into runs of consecutive elements that share the same key.
///
/// `f` computes the key of an element and is called once per element.
/// Each yielded pair is `(key, elements)`, where `elements` are in input
/// order. The input is enumerated once and only the current run is buffered.
/// An empty input yields nothing.
let splitByChanged f xs =
    seq {
        // The state is created inside the seq block so that each enumeration
        // of the result starts fresh; a cell shared between enumerations
        // would leak the last run of one pass into the next.
        let acc = ref None
        for x in xs do
            let key = f x
            match acc.Value with
            | Some (current, chunk) when current = key ->
                acc.Value <- Some (current, x :: chunk)
            | Some (current, chunk) ->
                // The chunk is accumulated newest-first; restore input order.
                yield current, List.rev chunk
                acc.Value <- Some (key, [ x ])
            | None ->
                acc.Value <- Some (key, [ x ])
        // Flush the trailing run, if any.
        match acc.Value with
        | Some (current, chunk) -> yield current, List.rev chunk
        | None -> ()
    }
// Split the points into runs of equal x.
splitByChanged (fun (point : Point) -> point.x) points
该函数具有以下签名:
val splitByChanged :
f:('a -> 'b) -> xs:seq<'a> -> seq<'b * 'a list> when 'b : equality
欢迎提供更正,甚至更好的解决方案
答案 2 :(得分:0)
另一种解决方案,思路与 Kevin 的答案相同:
module Seq =
/// Splits `src` into arrays of consecutive elements that share the same key.
///
/// `f` computes the key of an element. Each yielded array holds one run of
/// equal-keyed elements in input order. The input is enumerated once and
/// only the current run is buffered. An empty input yields nothing.
let chunkBy f src =
    seq {
        let chunk = ResizeArray<_>()
        // Placeholder only: `key` is never compared while `chunk` is empty.
        let mutable key = Unchecked.defaultof<_>
        for x in src do
            let newKey = f x
            if chunk.Count <> 0 && newKey <> key then
                // Copy before clearing: the buffer is reused for the next run.
                yield chunk.ToArray()
                chunk.Clear()
            key <- newKey
            chunk.Add(x)
        // Flush the trailing run; without this the last chunk would be lost.
        if chunk.Count <> 0 then
            yield chunk.ToArray()
    }
// Takes the first two chunks keyed by y: 2 arrays, each with 1000 elements.
Seq.take 2 (Seq.chunkBy (fun (pt : Point) -> pt.y) points)
这是一种纯函数式的方法,它肯定更慢,也更难理解。
module Seq =
    /// Purely functional variant of run-splitting built on `Seq.scan`.
    ///
    /// `f` computes the key of an element. Each yielded list holds one run of
    /// consecutive equal-keyed elements, in reverse input order (newest
    /// first). An empty input yields nothing.
    let chunkByFold f src =
        // Wrap every element in Some and append a single None sentinel, so
        // the scan gets one extra step in which to flush the final run.
        Seq.append (Seq.map Some src) (Seq.singleton None)
        |> Seq.scan
            (fun (current, _) item ->
                // State: (run in progress as (key, items), run completed by this step).
                match current, item with
                | None, Some x -> Some (f x, [ x ]), None
                | None, None -> None, None
                | Some (key, items), Some x ->
                    let newKey = f x
                    if newKey = key then Some (key, x :: items), None
                    else Some (newKey, [ x ]), Some items
                | Some (_, items), None -> None, Some items)
            (None, None)
        // Keep only the steps that completed a run.
        |> Seq.choose snd
答案 3 :(得分:0)
让我们从输入开始:
let count = 1000
/// A 3D point; stands in for one row of the input.
type Point =
    { x : float
      y : float
      z : float }

/// Lazily enumerates count^3 points, with z varying fastest, then y, then x.
let points =
    seq { 1 .. count }
    |> Seq.collect (fun x ->
        seq { 1 .. count }
        |> Seq.collect (fun y ->
            seq { 1 .. count }
            |> Seq.map (fun z -> { x = float x; y = float y; z = float z })))
val count : int = 1000
type Point =
{x: float;
y: float;
z: float;}
val points : seq<Point>
如果我们尝试对 points 完整求值(把它全部转换成列表),就会得到 OutOfMemoryException:
// Materialise the whole sequence as a list.
Seq.toList points
System.OutOfMemoryException: Exception of type 'System.OutOfMemoryException' was thrown.
at Microsoft.FSharp.Collections.FSharpList`1.Cons(T head, FSharpList`1 tail)
at Microsoft.FSharp.Collections.SeqModule.ToList[T](IEnumerable`1 source)
at <StartupCode$FSI_0011>.$FSI_0011.main@()
Stopped due to error
这可能也是 groupBy 失败的原因,但我不确定。不过它说明,我们必须使用 seq 和 yield 来逐个返回各组。于是得到如下实现:
/// Splits `points` into lists of consecutive elements.
///
/// `groupBy previous current` is called with the most recently accumulated
/// element and the incoming one; while it returns true the incoming element
/// joins the current group, otherwise the current group is yielded and a new
/// one is started.
///
/// Elements inside each yielded list are in reverse input order (newest
/// first). The input is enumerated once and only the current group is
/// buffered. An empty input yields nothing.
let group groupBy points =
    seq {
        // The accumulator lives inside the seq block so that each enumeration
        // of the result starts from a clean state.
        let acc = ref []
        for p in points do
            match acc.Value with
            | [] -> acc.Value <- [ p ]
            | latest :: _ when groupBy latest p -> acc.Value <- p :: acc.Value
            | finished ->
                acc.Value <- [ p ]
                yield finished
        // Flush the trailing group; without this the last group would be lost.
        if not (List.isEmpty acc.Value) then
            yield acc.Value
    }
val group : groupBy:('a -> 'a -> bool) -> points:seq<'a> -> seq<'a list>
这不是最容易阅读的代码。它从点序列中依次取出每个点,当满足 groupBy 函数时,将其添加到累加器列表的头部;当不满足 groupBy 函数时,则新建一个累加器列表,并产出(yield)旧的累加器列表。请注意,累加器列表中元素的顺序是相反的。
测试功能:
// Print the x value and the size of every group of points sharing the same x.
points
|> group (fun (p' : Point) (p : Point) -> p'.x = p.x)
|> Seq.iter (fun g -> printfn "%f %i" (List.head g).x (List.length g))
(运行一段时间后)能够正常结束。
下面是另一个修复了错误、格式也更好的实现:
/// Splits `points` into runs of consecutive elements that share the same key.
///
/// `groupBy` computes the key of an element and is called once per element.
/// Each yielded pair is `(key, elements)`, where `elements` are in reverse
/// input order (newest first). The input is enumerated once and only the
/// current run is buffered. An empty input yields nothing.
let group (groupBy : 'a -> 'b when 'b : equality) points =
    seq {
        // The state is created inside the seq block so that each enumeration
        // of the result starts fresh.
        let current = ref None
        for p in points do
            let key = groupBy p
            match current.Value with
            | Some (k, items) when k = key -> current.Value <- Some (k, p :: items)
            | Some finished ->
                current.Value <- Some (key, [ p ])
                yield finished
            | None -> current.Value <- Some (key, [ p ])
        // Flush the trailing run; there is none when the input was empty.
        match current.Value with
        | Some finished -> yield finished
        | None -> ()
    }