为什么 F# 自动生成的 GetHashCode 会产生这么多冲突?

时间:2014-11-07 12:20:44

标签: f#

在我们的 F# 代码中,自动生成的 GetHashCode 实现表现出非常糟糕的性能和很高的冲突率。这是 F# 的 GetHashCode 生成器实现本身的问题,还是只是一个边缘情况?

open System
open System.Collections.Generic

/// Times the insertion of `keys` into a dictionary that uses the equality
/// comparer `e`, then measures how often `e` produces a repeated hash code.
///
/// keys: array of keys; each one is added to a fresh Dictionary via Add.
/// e:    equality comparer handed to the Dictionary and used for hashing.
/// name: label printed at the start of the result line.
///
/// Prints "<name> <seconds> sec <ratio> collisions", where <ratio> is the
/// number of keys whose hash code was already produced by an earlier key,
/// divided by the total number of keys.
let check keys e name =
    // The dictionary capacity is set to the number of keys up front.
    let table = new Dictionary<_, _>(Array.length keys, e)
    // Only the insertion loop sits between StartNew and Stop.
    let timer = System.Diagnostics.Stopwatch.StartNew()
    for key in keys do
        table.Add(key, 1.02)
    timer.Stop()
    // HashSet.Add returns false when the value is already in the set,
    // i.e. when an earlier key yielded the same hash code.
    let seenHashes = new HashSet<int>()
    let mutable repeated = 0
    for key in keys do
        if not (seenHashes.Add(e.GetHashCode(key))) then
            repeated <- repeated + 1
    printfn "%s %f sec %f collisions" name timer.Elapsed.TotalSeconds (double(repeated) / double(keys.Length))

/// Generic two-field value type used as the dictionary key in the benchmark.
/// No Equals/GetHashCode override is written here, so equality and hashing
/// are whatever the F# compiler supplies for the type.
[<Struct>]
type StructTuple<'T,'T2> =
    /// First component.
    val fst: 'T
    /// Second component.
    val snd: 'T2
    /// Builds a value from its two components.
    new(fst: 'T, snd: 'T2) = { fst = fst; snd = snd }

/// Endless sequence of pseudo-random keys. The first component is drawn with
/// Random.Next(0, 3346862) and the second with Random.Next(0, 658); Next
/// excludes its upper bound. A new Random is created each time the sequence
/// is enumerated, and duplicates are possible.
let bad_keys =
    Seq.delay (fun () ->
        let rng = new Random()
        () |> Seq.unfold (fun () ->
            // Draw the first component before the second, one pair per step.
            let first = uint32 (rng.Next(0, 3346862))
            let second = uint16 (rng.Next(0, 658))
            Some(StructTuple(first, second), ())))

/// Lazily enumerates every (j, k) key in a fixed order: k walks 0..658 in the
/// outer position and, for each k, j walks 0..3346862 (both ranges inclusive).
let good_keys =
    seq { 0us .. 658us }
    |> Seq.collect (fun k ->
        seq { 0u .. 3346862u }
        |> Seq.map (fun j -> StructTuple(j, k)))

module CmpHelpers =
    /// Merges two hash codes: computes (h1 <<< 5) + h1 and XORs the result
    /// with h2. Addition binds tighter than ^^^, so the sum is formed first.
    let inline combine (h1:int) (h2:int) =
        let spread = (h1 <<< 5) + h1
        spread ^^^ h2

/// Hand-written equality comparer for StructTuple. Both fields are compared
/// and hashed through EqualityComparer<Object>.Default, and the two field
/// hashes are merged with CmpHelpers.combine.
type StructTupleComparer<'T,'T2>() =
    // The comparer is typed on Object, so fields are passed to it as objects.
    let objComparer = EqualityComparer<Object>.Default
    interface IEqualityComparer<StructTuple<'T,'T2>> with
        /// True when the fst fields are equal and the snd fields are equal;
        /// snd is only examined when fst already matched.
        member self.Equals (a,b) =
            if objComparer.Equals(a.fst, b.fst) then
                objComparer.Equals(a.snd, b.snd)
            else
                false
        /// Hash of fst combined with hash of snd, in that order.
        member self.GetHashCode (x) =
            let fstHash = objComparer.GetHashCode(x.fst)
            let sndHash = objComparer.GetHashCode(x.snd)
            CmpHelpers.combine fstHash sndHash

/// Equality comparer for StructTuple written out by hand on top of the
/// LanguagePrimitives.HashCompare intrinsics.
/// NOTE(review): judging by the type name, this presumably mirrors the code
/// the F# compiler generates for StructTuple - confirm against the compiler output.
type AutoGeneratedStructTupleComparer<'T,'T2>() =
    let genericComparer = LanguagePrimitives.GenericEqualityComparer
    interface IEqualityComparer<StructTuple<'T,'T2>> with
        /// Field-by-field equality: fst first, snd only if fst matched.
        member self.Equals (a:StructTuple<'T,'T2>,b:StructTuple<'T,'T2>) =
            if LanguagePrimitives.HashCompare.GenericEqualityERIntrinsic<'T> a.fst b.fst then
                LanguagePrimitives.HashCompare.GenericEqualityERIntrinsic<'T2> a.snd b.snd
            else
                false
        /// Folds the hash of snd, then the hash of fst, into an accumulator that starts at 0.
        member self.GetHashCode (x:StructTuple<'T,'T2>) =
            // One mixing round: the constant (0x9E3779B9 read as a signed int) plus
            // the field hash plus the accumulator shifted left by 6 and right by 2.
            let mix (acc:int) (fieldHash:int) =
                -1640531527 + (fieldHash + ((acc <<< 6) + (acc >>> 2)))
            let sndHash = LanguagePrimitives.HashCompare.GenericHashWithComparerIntrinsic<'T2> genericComparer x.snd
            let afterSnd = mix 0 sndHash
            let fstHash = LanguagePrimitives.HashCompare.GenericHashWithComparerIntrinsic<'T> genericComparer x.fst
            mix afterSnd fstHash


/// Returns the distinct elements of `sq` as an array by loading the sequence
/// into a HashSet and copying the set's contents out.
let uniq (sq:seq<'a>) =
    let distinct = new HashSet<_>(sq)
    Seq.toArray distinct

/// Program entry point: benchmarks the three comparers first on keys taken
/// from good_keys, then on keys taken from bad_keys, waits for a line on
/// standard input, and returns exit code 0.
[<EntryPoint>]
let main argv =
    let count = 15000000

    // Takes the first `count` keys from `source`, de-duplicates them, prints
    // `title`, and runs `check` once per comparer on the resulting array.
    let runSuite (title: string) (source: seq<StructTuple<uint32, uint16>>) =
        let keys = source |> Seq.take count |> uniq
        printfn "%s" title
        check keys (new StructTupleComparer<_,_>()) "struct custom"
        check keys HashIdentity.Structural "struct auto"
        check keys (new AutoGeneratedStructTupleComparer<_,_>()) "struct auto explicit"

    runSuite "good keys" good_keys
    runSuite "bad keys" bad_keys

    // Block until a line is entered before the process exits.
    Console.ReadLine() |> ignore
    0 // return an integer exit code

输出

good keys

struct custom 1.506934 sec 0.000000 collisions

struct auto 4.832881 sec 0.776863 collisions

struct auto explicit 3.166931 sec 0.776863 collisions

bad keys

struct custom 3.631251 sec 0.061893 collisions

struct auto 10.340693 sec 0.777034 collisions

struct auto explicit 8.893612 sec 0.777034 collisions

2 个答案:

答案 0 :(得分:3)

我并不熟悉自动生成 Equals 和 GetHashCode 的整体算法,但在这个例子里它生成的实现显然不够理想。我不确定这是通用的自动生成实现的固有局限,还是说存在某种可靠的方法能自动生成接近最优的实现。

值得注意的是,如果您只使用标准元组,自动生成的散列和比较会得到与自定义实现相同的冲突率和相同的性能。使用最新的 F# 4.0 版本(这方面最近有一项显著的性能改进)时,自动生成的实现甚至比自定义实现快得多。

我的测试数据:

// F# 3.1, struct tuples
good keys
  custom 0.951254 sec 0.000000 collisions
  auto 2.737166 sec 0.776863 collisions
bad keys
  custom 2.923103 sec 0.061869 collisions
  auto 7.706678 sec 0.777040 collisions

// F# 3.1, standard tuples
good keys
  custom 0.995701 sec 0.000000 collisions
  auto 0.965949 sec 0.000000 collisions
bad keys
  custom 3.091821 sec 0.061869 collisions
  auto 2.924721 sec 0.061869 collisions

// F# 4.0, standard tuples
good keys
  custom 1.018672 sec 0.000000 collisions
  auto 0.619066 sec 0.000000 collisions
bad keys
  custom 3.082988 sec 0.061869 collisions
  auto 1.829720 sec 0.061869 collisions

答案 1 :(得分:1)

已在 F# 的问题跟踪器中提交了该问题,并已被确认为 bug:https://github.com/fsharp/fsharp/issues/343