F#中的Eratosthenes筛

时间:2011-01-07 20:01:30

标签: algorithm f# sieve-of-eratosthenes

我对纯粹功能性F#中sieve of eratosthenes的实现感兴趣。我对实际筛子not the naive functional implementation that isn't really the sieve的实施感兴趣,所以不是这样的:

let rec PseudoSieve list =
    match list with
    | hd::tl -> hd :: (PseudoSieve <| List.filter (fun x -> x % hd <> 0) tl)
    | [] -> []

上面的第二个链接简要描述了一种需要使用多图的算法,据我所知,这在F#中是不可用的。给出的Haskell实现使用支持insertWith方法的地图,我在F# functional map中没有看到。

有没有人知道将给定的Haskell映射代码转换为F#的方法,或者可能知道替代实现方法或筛选算法哪些有效且更适合功能实现或F#?

16 个答案:

答案 0 :(得分:16)

阅读那篇文章我提出了一个不需要多图的想法。它通过一次又一次地将碰撞键向前移动其素值来处理碰撞地图键,直到它到达不在地图中的键。低于primes的是一个映射,其中包含下一个迭代器值的键和值为素数的值。

let primes = 
    let rec nextPrime n p primes =
        if primes |> Map.containsKey n then
            nextPrime (n + p) p primes
        else
            primes.Add(n, p)

    let rec prime n primes =
        seq {
            if primes |> Map.containsKey n then
                let p = primes.Item n
                yield! prime (n + 1) (nextPrime (n + p) p (primes.Remove n))
            else
                yield n
                yield! prime (n + 1) (primes.Add(n * n, n))
        }

    prime 2 Map.empty

这是基于优先级队列的算法,paper没有方形优化。我将通用优先级队列函数放在顶部。我用一个元组来表示惰性列表迭代器。

let primes() = 
    // the priority queue functions
    let insert = Heap.Insert
    let findMin = Heap.Min
    let insertDeleteMin = Heap.DeleteInsert

    // skips primes 2, 3, 5, 7
    let wheelData = [|2L;4L;2L;4L;6L;2L;6L;4L;2L;4L;6L;6L;2L;6L;4L;2L;6L;4L;6L;8L;4L;2L;4L;2L;4L;8L;6L;4L;6L;2L;4L;6L;2L;6L;6L;4L;2L;4L;6L;2L;6L;4L;2L;4L;2L;10L;2L;10L|]

    // increments iterator
    let wheel (composite, n, prime) =
        composite + wheelData.[n % 48] * prime, n + 1, prime

    let insertPrime prime n table =
        insert (prime * prime, n, prime) table

    let rec adjust x (table : Heap) =
        let composite, n, prime = findMin table

        if composite <= x then 
            table 
            |> insertDeleteMin (wheel (composite, n, prime))
            |> adjust x
        else
            table

    let rec sieve iterator table =
        seq {
            let x, n, _ = iterator
            let composite, _, _ = findMin table

            if composite <= x then
                yield! sieve (wheel iterator) (adjust x table)
            else
                if x = 13L then
                    yield! [2L; 3L; 5L; 7L; 11L]

                yield x
                yield! sieve (wheel iterator) (insertPrime x n table)
        }

    sieve (13L, 1, 1L) (insertPrime 11L 0 (Heap(0L, 0, 0L)))

这是基于优先级队列的算法和方形优化。为了便于延迟向查询表添加素数,必须将轮偏移与素数值一起返回。此版本的算法具有O(sqrt(n))内存使用,其中无优化的一个是O(n)。

let rec primes2() : seq<int64 * int> = 
    // the priority queue functions
    let insert = Heap.Insert
    let findMin = Heap.Min
    let insertDeleteMin = Heap.DeleteInsert

    // increments iterator
    let wheel (composite, n, prime) =
        composite + wheelData.[n % 48] * prime, n + 1, prime

    let insertPrime enumerator composite table =
        // lazy initialize the enumerator
        let enumerator =
            if enumerator = null then
                let enumerator = primes2().GetEnumerator()
                enumerator.MoveNext() |> ignore
                // skip primes that are a part of the wheel
                while fst enumerator.Current < 11L do
                    enumerator.MoveNext() |> ignore
                enumerator
            else
                enumerator

        let prime = fst enumerator.Current
        // Wait to insert primes until their square is less than the tables current min
        if prime * prime < composite then
            enumerator.MoveNext() |> ignore
            let prime, n = enumerator.Current
            enumerator, insert (prime * prime, n, prime) table
        else
            enumerator, table

    let rec adjust x table =
        let composite, n, prime = findMin table

        if composite <= x then 
            table 
            |> insertDeleteMin (wheel (composite, n, prime))
            |> adjust x
        else
            table

    let rec sieve iterator (enumerator, table) = 
        seq {
            let x, n, _ = iterator
            let composite, _, _ = findMin table

            if composite <= x then
                yield! sieve (wheel iterator) (enumerator, adjust x table)
            else
                if x = 13L then
                    yield! [2L, 0; 3L, 0; 5L, 0; 7L, 0; 11L, 0]

                yield x, n
                yield! sieve (wheel iterator) (insertPrime enumerator composite table)
        }

    sieve (13L, 1, 1L) (null, insert (11L * 11L, 0, 11L) (Heap(0L, 0, 0L)))

这是我的测试程序。

type GenericHeap<'T when 'T : comparison>(defaultValue : 'T) =
    let mutable capacity = 1
    let mutable values = Array.create capacity defaultValue
    let mutable size = 0

    let swap i n =
        let temp = values.[i]
        values.[i] <- values.[n]
        values.[n] <- temp

    let rec rollUp i =
        if i > 0 then
            let parent = (i - 1) / 2
            if values.[i] < values.[parent] then
                swap i parent
                rollUp parent

    let rec rollDown i =
        let left, right = 2 * i + 1, 2 * i + 2

        if right < size then
            if values.[left] < values.[i] then
                if values.[left] < values.[right] then
                    swap left i
                    rollDown left
                else
                    swap right i
                    rollDown right
            elif values.[right] < values.[i] then
                swap right i
                rollDown right
        elif left < size then
            if values.[left] < values.[i] then
                swap left i

    member this.insert (value : 'T) =
        if size = capacity then
            capacity <- capacity * 2
            let newValues = Array.zeroCreate capacity
            for i in 0 .. size - 1 do
                newValues.[i] <- values.[i]
            values <- newValues

        values.[size] <- value
        size <- size + 1
        rollUp (size - 1)

    member this.delete () =
        values.[0] <- values.[size]
        size <- size - 1
        rollDown 0

    member this.deleteInsert (value : 'T) =
        values.[0] <- value
        rollDown 0

    member this.min () =
        values.[0]

    static member Insert (value : 'T) (heap : GenericHeap<'T>) =
        heap.insert value
        heap    

    static member DeleteInsert (value : 'T) (heap : GenericHeap<'T>) =
        heap.deleteInsert value
        heap    

    static member Min (heap : GenericHeap<'T>) =
        heap.min()

type Heap = GenericHeap<int64 * int * int64>

let wheelData = [|2L;4L;2L;4L;6L;2L;6L;4L;2L;4L;6L;6L;2L;6L;4L;2L;6L;4L;6L;8L;4L;2L;4L;2L;4L;8L;6L;4L;6L;2L;4L;6L;2L;6L;6L;4L;2L;4L;6L;2L;6L;4L;2L;4L;2L;10L;2L;10L|]

let primes() = 
    // the priority queue functions
    let insert = Heap.Insert
    let findMin = Heap.Min
    let insertDeleteMin = Heap.DeleteInsert

    // increments iterator
    let wheel (composite, n, prime) =
        composite + wheelData.[n % 48] * prime, n + 1, prime

    let insertPrime prime n table =
        insert (prime * prime, n, prime) table

    let rec adjust x (table : Heap) =
        let composite, n, prime = findMin table

        if composite <= x then 
            table 
            |> insertDeleteMin (wheel (composite, n, prime))
            |> adjust x
        else
            table

    let rec sieve iterator table =
        seq {
            let x, n, _ = iterator
            let composite, _, _ = findMin table

            if composite <= x then
                yield! sieve (wheel iterator) (adjust x table)
            else
                if x = 13L then
                    yield! [2L; 3L; 5L; 7L; 11L]

                yield x
                yield! sieve (wheel iterator) (insertPrime x n table)
        }

    sieve (13L, 1, 1L) (insertPrime 11L 0 (Heap(0L, 0, 0L)))

let rec primes2() : seq<int64 * int> = 
    // the priority queue functions
    let insert = Heap.Insert
    let findMin = Heap.Min
    let insertDeleteMin = Heap.DeleteInsert

    // increments iterator
    let wheel (composite, n, prime) =
        composite + wheelData.[n % 48] * prime, n + 1, prime

    let insertPrime enumerator composite table =
        // lazy initialize the enumerator
        let enumerator =
            if enumerator = null then
                let enumerator = primes2().GetEnumerator()
                enumerator.MoveNext() |> ignore
                // skip primes that are a part of the wheel
                while fst enumerator.Current < 11L do
                    enumerator.MoveNext() |> ignore
                enumerator
            else
                enumerator

        let prime = fst enumerator.Current
        // Wait to insert primes until their square is less than the tables current min
        if prime * prime < composite then
            enumerator.MoveNext() |> ignore
            let prime, n = enumerator.Current
            enumerator, insert (prime * prime, n, prime) table
        else
            enumerator, table

    let rec adjust x table =
        let composite, n, prime = findMin table

        if composite <= x then 
            table 
            |> insertDeleteMin (wheel (composite, n, prime))
            |> adjust x
        else
            table

    let rec sieve iterator (enumerator, table) = 
        seq {
            let x, n, _ = iterator
            let composite, _, _ = findMin table

            if composite <= x then
                yield! sieve (wheel iterator) (enumerator, adjust x table)
            else
                if x = 13L then
                    yield! [2L, 0; 3L, 0; 5L, 0; 7L, 0; 11L, 0]

                yield x, n
                yield! sieve (wheel iterator) (insertPrime enumerator composite table)
        }

    sieve (13L, 1, 1L) (null, insert (11L * 11L, 0, 11L) (Heap(0L, 0, 0L)))


let mutable i = 0

let compare a b =
    i <- i + 1
    if a = b then
        true
    else
        printfn "%A %A %A" a b i
        false

Seq.forall2 compare (Seq.take 50000 (primes())) (Seq.take 50000 (primes2() |> Seq.map fst))
|> printfn "%A"

primes2()
|> Seq.map fst
|> Seq.take 10
|> Seq.toArray
|> printfn "%A"

primes2()
|> Seq.map fst
|> Seq.skip 999999
|> Seq.take 10
|> Seq.toArray
|> printfn "%A"

System.Console.ReadLine() |> ignore

答案 1 :(得分:9)

虽然one answerPriority Queue (PQ)使用SkewBinomialHeap给出了balanced binary search tree算法,但它可能不是正确的PQ。 Eratosthenes(iEoS)所需的增量Sieve是一个PQ,它具有出色的性能,可以获得最小值,并且重新插入值大部分进一步排在队列之下,但是由于iSoE只增加了新值,因此不需要最佳性能作为新值,总数达到范围的平方根(这是每次减少时重复插入次数的一小部分)。 SkewBinomialHeap PQ并没有比使用内置Map更多,它使用Heap - 所有O(log n)操作 - 除了它稍微改变了操作的权重以支持SoE的要求。但是,SkewBinaryHeap每次还原仍需要很多O(log n)操作。

PQ实现为Binary Heap,更具体地说是AVL tree,更具体地说,作为MinHeap,它几乎满足了iSoE的要求,其中O(1)性能最低,并且O(log n)重新插入和添加新条目的性能,虽然性能实际上是O(log n)的一小部分,因为大多数重新插入发生在队列顶部附近并且大多数新值的添加(这不重要,因为它们不常见)发生在队列末端附近,这些操作最有效。此外,MinHeap PQ可以在一个(通常是一小部分)一个O(log n)遍中有效地实现删除最小值和插入函数。然后,而不是Map(实现为Dustin Cambell's mutable Dictionary based code),其中有一个O(log n)操作,通常有一个完整的&n; log n&#39;由于我们需要在树的最左边的最后一个叶子处的最小值,因此我们通常在根处添加和移除最小值并且在一次通过中平均插入几个级别。因此,MinHeap PQ只能在每次剔除减少时使用O(log n)操作的一小部分而不是多个较大的O(log n)次操作。

MinHeap PQ可以使用纯功能代码实现(没有&#34; removeMin&#34;由于iSoE不需要它而实现,但有一个&#34; adjust&#34;功能可供使用在细分中),如下:

[<RequireQualifiedAccess>]
module MinHeap =

  type MinHeapTreeEntry<'T> = class val k:uint32 val v:'T new(k,v) = { k=k;v=v } end
  [<CompilationRepresentation(CompilationRepresentationFlags.UseNullAsTrueValue)>]
  [<NoEquality; NoComparison>]
  type MinHeapTree<'T> = 
      | HeapEmpty 
      | HeapOne of MinHeapTreeEntry<'T>
      | HeapNode of MinHeapTreeEntry<'T> * MinHeapTree<'T> * MinHeapTree<'T> * uint32

  let empty = HeapEmpty

  let getMin pq = match pq with | HeapOne(kv) | HeapNode(kv,_,_,_) -> Some kv | _ -> None

  let insert k v pq =
    let kv = MinHeapTreeEntry(k,v)
    let rec insert' kv msk pq =
      match pq with
        | HeapEmpty -> HeapOne kv
        | HeapOne kv2 -> if k < kv2.k then HeapNode(kv,pq,HeapEmpty,2u)
                          else let nn = HeapOne kv in HeapNode(kv2,nn,HeapEmpty,2u)
        | HeapNode(kv2,l,r,cnt) ->
          let nc = cnt + 1u
          let nmsk = if msk <> 0u then msk <<< 1
                     else let s = int32 (System.Math.Log (float nc) / System.Math.Log(2.0))
                          (nc <<< (32 - s)) ||| 1u //never ever zero again with the or'ed 1
          if k <= kv2.k then if (nmsk &&& 0x80000000u) = 0u then HeapNode(kv,insert' kv2 nmsk l,r,nc)
                                                            else HeapNode(kv,l,insert' kv2 nmsk r,nc)
          else if (nmsk &&& 0x80000000u) = 0u then HeapNode(kv2,insert' kv nmsk l,r,nc)
                else HeapNode(kv2,l,insert' kv nmsk r,nc)
    insert' kv 0u pq

  let private reheapify kv k pq =
    let rec reheapify' pq =
      match pq with
        | HeapEmpty -> HeapEmpty //should never be taken
        | HeapOne kvn -> HeapOne kv
        | HeapNode(kvn,l,r,cnt) ->
            match r with
              | HeapOne kvr when k > kvr.k ->
                  match l with //never HeapEmpty
                    | HeapOne kvl when k > kvl.k -> //both qualify, choose least
                        if kvl.k > kvr.k then HeapNode(kvr,l,HeapOne kv,cnt)
                        else HeapNode(kvl,HeapOne kv,r,cnt)
                    | HeapNode(kvl,_,_,_) when k > kvl.k -> //both qualify, choose least
                        if kvl.k > kvr.k then HeapNode(kvr,l,HeapOne kv,cnt)
                        else HeapNode(kvl,reheapify' l,r,cnt)
                    | _ -> HeapNode(kvr,l,HeapOne kv,cnt) //only right qualifies
              | HeapNode(kvr,_,_,_) when k > kvr.k -> //need adjusting for left leaf or else left leaf
                  match l with //never HeapEmpty or HeapOne
                    | HeapNode(kvl,_,_,_) when k > kvl.k -> //both qualify, choose least
                        if kvl.k > kvr.k then HeapNode(kvr,l,reheapify' r,cnt)
                        else HeapNode(kvl,reheapify' l,r,cnt)
                    | _ -> HeapNode(kvr,l,reheapify' r,cnt) //only right qualifies
              | _ -> match l with //r could be HeapEmpty but l never HeapEmpty
                        | HeapOne(kvl) when k > kvl.k -> HeapNode(kvl,HeapOne kv,r,cnt)
                        | HeapNode(kvl,_,_,_) when k > kvl.k -> HeapNode(kvl,reheapify' l,r,cnt)
                        | _ -> HeapNode(kv,l,r,cnt) //just replace the contents of pq node with sub leaves the same
    reheapify' pq


  let reinsertMinAs k v pq =
    let kv = MinHeapTreeEntry(k,v)
    reheapify kv k pq

  let adjust f (pq:MinHeapTree<_>) = //adjust all the contents using the function, then rebuild by reheapify
    let rec adjust' pq =
      match pq with
        | HeapEmpty -> pq
        | HeapOne kv -> HeapOne(MinHeapTreeEntry(f kv.k kv.v))
        | HeapNode (kv,l,r,cnt) -> let nkv = MinHeapTreeEntry(f kv.k kv.v)
                                   reheapify nkv nkv.k (HeapNode(kv,adjust' l,adjust' r,cnt))
    adjust' pq

使用上述模块,可以使用车轮因子分解优化和使用高效的共感应流(CIS&#39;)编写iSoE,如下所示:

type CIS<'T> = class val v:'T val cont:unit->CIS<'T> new(v,cont) = { v=v;cont=cont } end
type cullstate = struct val p:uint32 val wi:int new(cnd,cndwi) = { p=cnd;wi=cndwi } end
let primesPQWSE() =
  let WHLPRMS = [| 2u;3u;5u;7u |] in let FSTPRM = 11u in let WHLCRC = int (WHLPRMS |> Seq.fold (*) 1u) >>> 1
  let WHLLMT = int (WHLPRMS |> Seq.fold (fun o n->o*(n-1u)) 1u) - 1
  let WHLPTRN =
    let wp = Array.zeroCreate (WHLLMT+1)
    let gaps (a:int[]) = let rec gap i acc = if a.[i]=0 then gap (i+1) (acc+1uy) else acc
                         {0..WHLCRC-1} |> Seq.fold (fun s i->
                           let ns = if a.[i]<>0 then wp.[s]<-2uy*gap (i+1) 1uy;(s+1) else s in ns) 0 |> ignore
    Array.init (WHLCRC+1) (fun i->if WHLPRMS |> Seq.forall (fun p->(FSTPRM+uint32(i<<<1))%p<>0u)
                                  then 1 else 0) |> gaps;wp
  let inline whladv i = if i < WHLLMT then i + 1 else 0 in let advcnd c i = c + uint32 WHLPTRN.[i]
  let inline culladv c p i = let n = c + uint32 WHLPTRN.[i] * p in if n < c then 0xFFFFFFFFu else n
  let rec mkprm (n,wi,pq,(bps:CIS<_>),q) =
    let nxt = advcnd n wi in let nxti = whladv wi
    if nxt < n then (0u,0,(0xFFFFFFFFu,0,MinHeap.empty,bps,q))
    elif n>=q then let bp,bpi = bps.v in let nc,nci = culladv n bp bpi,whladv bpi
                    let nsd = bps.cont() in let np,_ = nsd.v in let sqr = if np>65535u then 0xFFFFFFFFu else np*np
                    mkprm (nxt,nxti,(MinHeap.insert nc (cullstate(bp,nci)) pq),nsd,sqr)
    else match MinHeap.getMin pq with | None -> (n,wi,(nxt,nxti,pq,bps,q))
                                      | Some kv -> let ca,cs = culladv kv.k kv.v.p kv.v.wi,cullstate(kv.v.p,whladv kv.v.wi)
                                                   if n>kv.k then mkprm (n,wi,(MinHeap.reinsertMinAs ca cs pq),bps,q)
                                                   elif n=kv.k then mkprm (nxt,nxti,(MinHeap.reinsertMinAs ca cs pq),bps,q)
                                                   else (n,wi,(nxt,nxti,pq,bps,q))
  let rec pCID p pi pq bps q = CIS((p,pi),fun()->let (np,npi,(nxt,nxti,npq,nbps,nq))=mkprm (advcnd p pi,whladv pi,pq,bps,q)
                                                 pCID np npi npq nbps nq)
  let rec baseprimes() = CIS((FSTPRM,0),fun()->let np=FSTPRM+uint32 WHLPTRN.[0]
                                               pCID np (whladv 0) MinHeap.empty (baseprimes()) (FSTPRM*FSTPRM))
  let genseq sd = Seq.unfold (fun (p,pi,pcc) ->if p=0u then None else Some(p,mkprm pcc)) sd
  seq { yield! WHLPRMS; yield! mkprm (FSTPRM,0,MinHeap.empty,baseprimes(),(FSTPRM*FSTPRM)) |> genseq }

上面的代码计算了大约0.077秒的前100,000个素数,0.977秒的前1,000,000个素数,大约14.33秒的前10,000,000个素数,以及大约221.87秒的前100,000,000个素数,全部在i7-2700K上( 3.5GHz)作为64位代码。这个纯粹的功能代码比tryfsharp的代码略快,增加了车轮分解的常见优化,延迟添加基本素数,并使用更高效的CID全部添加(ideoneimplemented as a mutable array但仍然是纯函数代码,他使用Dictionary类不是。然而,对于大约20亿(约1亿个素数)的较大素数范围,使用基于哈希表的字典的代码将更快,因为字典操作不具有O(log n)因子并且该增益克服了计算使用Dictionary哈希表的复杂性。

上述程序的另一个特点是分解轮被参数化,例如,通过将WHLPRMS设置为[|]可以使用极大的轮子。 2u; 3u; 5u; 7u; 11u; 13u; 17u; 19u |]和FSTPRM到23u,对于1000万个素数,大约9.34秒的大范围获得大约三分之二的运行时间,尽管注意它需要几秒钟在程序开始运行之前计算WHLPTRN,无论主要范围如何,这都是一个恒定的开销。

比较分析:与纯功能增量树折叠实现相比,此算法稍微快一些,因为MinHeap树的平均使用高度比深度小2倍。折叠的树,但由于它基于二进制堆,需要处理每个堆级别的左右叶和一个分支,因此可以通过等效的常量因子损失来抵消PQ树级别的效率。而不是每个级别的树木折叠的单一比较,通常采用较少的深度分支。与其他基于PQ和Map的函数算法相比,改进通常是通过减少遍历各个树结构的每个级别的O(log n)操作的数量的常数因素。

binary heap 400多年前发明的基于谱系树的模型之后,MinHeap通常是Michael Eytzinger Jon Harrop's code。我知道这个问题说对非功能性可变代码没有兴趣,但是如果必须避免使用所有使用可变性的子代码,那么我们就不能使用使用可变性的列表或者LazyList。 &#34;在封面下#34;出于性能原因。因此,想象一下MinHeap PQ的以下备用可变版本是由库提供的,并且在性能上有更大的素数范围,可以享受另外两个因素:

[<RequireQualifiedAccess>]
module MinHeap =

  type MinHeapTreeEntry<'T> = class val k:uint32 val v:'T new(k,v) = { k=k;v=v } end
  type MinHeapTree<'T> = ResizeArray<MinHeapTreeEntry<'T>>

  let empty<'T> = MinHeapTree<MinHeapTreeEntry<'T>>()

  let getMin (pq:MinHeapTree<_>) = if pq.Count > 0 then Some pq.[0] else None

  let insert k v (pq:MinHeapTree<_>) =
    if pq.Count = 0 then pq.Add(MinHeapTreeEntry(0xFFFFFFFFu,v)) //add an extra entry so there's always a right max node
    let mutable nxtlvl = pq.Count in let mutable lvl = nxtlvl <<< 1 //1 past index of value added times 2
    pq.Add(pq.[nxtlvl - 1]) //copy bottom entry then do bubble up while less than next level up
    while ((lvl <- lvl >>> 1); nxtlvl <- nxtlvl >>> 1; nxtlvl <> 0) do
      let t = pq.[nxtlvl - 1] in if t.k > k then pq.[lvl - 1] <- t else lvl <- lvl <<< 1; nxtlvl <- 0 //causes loop break
    pq.[lvl - 1] <-  MinHeapTreeEntry(k,v); pq

  let reinsertMinAs k v (pq:MinHeapTree<_>) = //do minify down for value to insert
    let mutable nxtlvl = 1 in let mutable lvl = nxtlvl in let cnt = pq.Count
    while (nxtlvl <- nxtlvl <<< 1; nxtlvl < cnt) do
      let lk = pq.[nxtlvl - 1].k in let rk = pq.[nxtlvl].k in let oldlvl = lvl
      let k = if k > lk then lvl <- nxtlvl; lk else k in if k > rk then nxtlvl <- nxtlvl + 1; lvl <- nxtlvl
      if lvl <> oldlvl then pq.[oldlvl - 1] <- pq.[lvl - 1] else nxtlvl <- cnt //causes loop break
    pq.[lvl - 1] <- MinHeapTreeEntry(k,v); pq

  let adjust f (pq:MinHeapTree<_>) = //adjust all the contents using the function, then re-heapify
    if pq <> null then 
      let cnt = pq.Count
      if cnt > 1 then
        for i = 0 to cnt - 2 do //change contents using function
          let e = pq.[i] in let k,v = e.k,e.v in pq.[i] <- MinHeapTreeEntry (f k v)
        for i = cnt/2 downto 1 do //rebuild by reheapify
          let kv = pq.[i - 1] in let k = kv.k
          let mutable nxtlvl = i in let mutable lvl = nxtlvl
          while (nxtlvl <- nxtlvl <<< 1; nxtlvl < cnt) do
            let lk = pq.[nxtlvl - 1].k in let rk = pq.[nxtlvl].k in let oldlvl = lvl
            let k = if k > lk then lvl <- nxtlvl; lk else k in if k > rk then nxtlvl <- nxtlvl + 1; lvl <- nxtlvl
            if lvl <> oldlvl then pq.[oldlvl - 1] <- pq.[lvl - 1] else nxtlvl <- cnt //causes loop break
          pq.[lvl - 1] <- kv
    pq

极客注意:我实际上预计可变版本可以提供更好的性能提升,但由于嵌套的if-then-else代码结构和主要淘汰的随机行为,它在重新插入时陷入困境值意味着CPU分支预测对于大部分分支失败,导致每次剔除的CPU时钟周期增加10%,以重建指令预取高速缓存。

此算法唯一的其他常数因素性能增益是分段和使用多任务处理,以获得与CPU核心数量成比例的性能增益;然而,就目前而言,这是迄今为止最快的纯函数SoE算法,甚至使用函数MinHeap的纯函数形式也胜过简单的命令式实现,例如Johan Kullbom's Sieve of Atkinprimesieve(这是错误的他的时机只计算了素数到1000万而不是第1000万素数,但如果使用更好的优化,这些算法的速度会快5倍。当我们添加更大的轮分解的多线程时,功能代码和命令代码之间的比例约为5,因为命令式代码的计算复杂性比功能代码增加得快,而多线程帮助比较慢的功能代码更多的功能代码。更快的命令式代码,因为后者更接近枚举通过找到的素数所需的时间的基本限制。

EDIT_ADD:即使有人可以选择继续使用MinHeap的纯功能版本,添加高效细分以准备多线程也会轻微地#34;打破&#34; &#34;纯粹&#34;功能代码如下:1)传递复合剔除素数表示的最有效方法是段大小的打包位数组,2)虽然数组的大小是已知的,但使用数组理解来以函数方式初始化它并不高效,因为它使用&#34; ResizeArray&#34;在需要为每个x添加项复制自己的封面下(我认为当前实现时x&#39;是8)并且使用Array.init并不会在特定索引处跳过很多值, 3)因此,填充culled-composite数组的最简单方法是零创建正确大小的函数,然后运行一个初始化函数,该函数可以写入每个可变数组索引不超过一次。虽然这不是严格的&#34;功能&#34;,但它的关系在于数组被初始化然后再也不会被修改。

具有添加分段,多线程,可编程轮因子周长和许多性能调整的代码如下(除了一些添加的新常量,实现分段和多线程的额外调整代码是底部大约一半从&#34; prmspg&#34;函数开始的代码:

type prmsCIS = class val pg:uint16 val bg:uint16 val pi:int val cont:unit->prmsCIS
                     new(pg,bg,pi,nxtprmf) = { pg=pg;bg=bg;pi=pi;cont=nxtprmf } end
type cullstate = struct val p:uint32 val wi:int new(cnd,cndwi) = { p=cnd;wi=cndwi } end
let primesPQOWSE() =
  let WHLPRMS = [| 2u;3u;5u;7u;11u;13u;17u |] in let FSTPRM = 19u in let WHLCRC = int(WHLPRMS |> Seq.fold (*) 1u)
  let MXSTP = uint64(FSTPRM-1u) in let BFSZ = 1<<<11 in let NUMPRCS = System.Environment.ProcessorCount
  let WHLLMT = int (WHLPRMS |> Seq.fold (fun o n->o*(n-1u)) 1u) - 1 in let WHLPTRN = Array.zeroCreate (WHLLMT+1)
  let WHLRNDUP = let gaps (a:int[]) = let rec gap i acc = if a.[i]=0 then gap (i+1) (acc+1)
                                                          else acc in let b = a |> Array.scan (+) 0
                                      Array.init (WHLCRC>>>1) (fun i->
                                        if a.[i]=0 then 0 else let g=2*gap (i+1) 1 in WHLPTRN.[b.[i]]<-byte g;1)
                 Array.init WHLCRC (fun i->if WHLPRMS |> Seq.forall (fun p->(FSTPRM+uint32(i<<<1))%p<>0u) then 1 else 0)
                 |> gaps |> Array.scan (+) 0
  let WHLPOS = WHLPTRN |> Array.map (uint32) |> Array.scan (+) 0u in let advcnd cnd cndi = cnd + uint32 WHLPTRN.[cndi]
  let MINRNGSTP = if WHLLMT<=31 then uint32(32/(WHLLMT+1)*WHLCRC) else if WHLLMT=47 then uint32 WHLCRC<<<1 else uint32 WHLCRC
  let MINBFRNG = uint32((BFSZ<<<3)/(WHLLMT+1)*WHLCRC)/MINRNGSTP*MINRNGSTP
  let MINBFRNG = if MINBFRNG=0u then MINRNGSTP else MINBFRNG
  let inline whladv i = if i < WHLLMT then i+1 else 0 in let inline culladv c p i = c+uint32 WHLPTRN.[i]*p
  let rec mkprm (n,wi,pq,(bps:prmsCIS),q,lstp,bgap) =
    let nxt,nxti = advcnd n wi,whladv wi
    if n>=q then let p = (uint32 bps.bg<<<16)+uint32 bps.pg
                 let nbps,nxtcmpst,npi = bps.cont(),culladv n p bps.pi,whladv bps.pi
                 let pg = uint32 nbps.pg in let np = p+pg in let sqr = q+pg*((p<<<1)+pg) //only works to p < about 13 million
                 let nbps = prmsCIS(uint16 np,uint16(np>>>16),nbps.pi,nbps.cont) //therefore, algorithm only works to p^2 or about
                 mkprm (nxt,nxti,(MinHeap.insert nxtcmpst (cullstate(p,npi)) pq),nbps,sqr,lstp,(bgap+1us)) //1.7 * 10^14
    else match MinHeap.getMin pq with 
           | None -> (uint16(n-uint32 lstp),bgap,wi,(nxt,nxti,pq,bps,q,n,1us)) //fix with q is uint64
           | Some kv -> let ca,cs = culladv kv.k kv.v.p kv.v.wi,cullstate(kv.v.p,whladv kv.v.wi)
                        if n>kv.k then mkprm (n,wi,(MinHeap.reinsertMinAs ca cs pq),bps,q,lstp,bgap)
                        elif n=kv.k then mkprm (nxt,nxti,(MinHeap.reinsertMinAs ca cs pq),bps,q,lstp,(bgap+1us))
                        else (uint16(n-uint32 lstp),bgap,wi,(nxt,nxti,pq,bps,q,n,1us))
  let rec pCIS p pg bg pi pq bps q = prmsCIS(pg,bg,pi,fun()->
    let (npg,nbg,npi,(nxt,nxti,npq,nbps,nq,nl,ng))=mkprm (p+uint32 WHLPTRN.[pi],whladv pi,pq,bps,q,p,0us)
    pCIS (p+uint32 npg) npg nbg npi npq nbps nq)
  let rec baseprimes() = prmsCIS(uint16 FSTPRM,0us,0,fun()->
                           let np,npi=advcnd FSTPRM 0,whladv 0
                           pCIS np (uint16 WHLPTRN.[0]) 1us npi MinHeap.empty (baseprimes()) (FSTPRM*FSTPRM))
  let prmspg nxt adj pq bp q =
    //compute next buffer size rounded up to next even wheel circle so at least one each base prime hits the page
    let rng = max (((uint32(MXSTP+uint64(sqrt (float (MXSTP*(MXSTP+4UL*nxt))))+1UL)>>>1)+MINRNGSTP)/MINRNGSTP*MINRNGSTP) MINBFRNG
    let nxtp() = async {
      let rec addprms pqx (bpx:prmsCIS) qx = 
        if qx>=adj then pqx,bpx,qx //add primes to queue for new lower limit
        else let p = (uint32 bpx.bg<<<16)+uint32 bpx.pg in let nbps = bpx.cont()
             let pg = uint32 nbps.pg in let np = p+pg in let sqr = qx+pg*((p<<<1)+pg)
             let nbps = prmsCIS(uint16 np,uint16(np>>>16),nbps.pi,nbps.cont)
             addprms (MinHeap.insert qx (cullstate(p,bpx.pi)) pqx) nbps sqr
      let adjcinpg low k (v:cullstate) = //adjust the cull states for the new page low value
        let p = v.p in let WHLSPN = int64 WHLCRC*int64 p in let db = int64 p*int64 WHLPOS.[v.wi]
        let db = if k<low then let nk = int64(low-k)+db in nk-((nk/WHLSPN)*WHLSPN)
                 else let nk = int64(k-low) in if db<nk then db+WHLSPN-nk else db-nk
        let r = WHLRNDUP.[int((((db>>>1)%(WHLSPN>>>1))+int64 p-1L)/int64 p)] in let x = int64 WHLPOS.[r]*int64 p
        let r = if r>WHLLMT then 0 else r in let x = if x<db then x+WHLSPN-db else x-db in uint32 x,cullstate(p,r)
      let bfbtsz = int rng/WHLCRC*(WHLLMT+1) in let nbuf = Array.zeroCreate (bfbtsz>>>5)
      let rec nxtp' wi cnt = let _,nbg,_,ncnt = mkprm cnt in let nwi = wi + int nbg
                             if nwi < bfbtsz then nbuf.[nwi>>>5] <- nbuf.[nwi>>>5] ||| (1u<<<(nwi&&&0x1F)); nxtp' nwi ncnt
                             else let _,_,pq,bp,q,_,_ = ncnt in nbuf,pq,bp,q //results incl buf and cont parms for next page
      let npq,nbp,nq = addprms pq bp q
      return nxtp' 0 (0u,0,MinHeap.adjust (adjcinpg adj) npq,nbp,nq-adj,0u,0us) }
    rng,nxtp() |> Async.StartAsTask
  let nxtpg nxt (cont:(_*System.Threading.Tasks.Task<_>)[]) = //(len,pq,bp,q) =
    let adj = (cont |> Seq.fold (fun s (r,_)  -> s+r) 0u)
    let _,tsk = cont.[0] in let _,pq,bp,q = tsk.Result
    let ncont = Array.init (NUMPRCS+1) (fun i -> if i<NUMPRCS then cont.[i+1]
                                                 else prmspg (nxt+uint64 adj) adj pq bp q)
    let _,tsk = ncont.[0] in let nbuf,_,_,_ = tsk.Result in nbuf,ncont
  //init cond buf[0], no queue, frst bp sqr offset
  let initcond = 0u,System.Threading.Tasks.Task.Factory.StartNew (fun()->
                   (Array.empty,MinHeap.empty,baseprimes(),FSTPRM*FSTPRM-FSTPRM))
  let nxtcond n = prmspg (uint64 n) (n-FSTPRM) MinHeap.empty (baseprimes()) (FSTPRM*FSTPRM-FSTPRM)
  let initcont = Seq.unfold (fun (n,((r,_)as v))->Some(v,(n+r,nxtcond (n+r)))) (FSTPRM,initcond)
                 |> Seq.take (NUMPRCS+1) |> Seq.toArray
  let rec nxtprm (c,ci,i,buf:uint32[],cont) =
    let rec nxtprm' c ci i =
      let nc = c + uint64 WHLPTRN.[ci] in let nci = whladv ci in let ni = i + 1 in let nw = ni>>>5
      if nw >= buf.Length then let (npg,ncont)=nxtpg nc cont in nxtprm (c,ci,-1,npg,ncont)
      elif (buf.[nw] &&& (1u <<< (ni &&& 0x1F))) = 0u then nxtprm' nc nci ni
      else nc,nci,ni,buf,cont
    nxtprm' c ci i
  seq { yield! WHLPRMS |> Seq.map (uint64);
        yield! Seq.unfold (fun ((c,_,_,_,_) as cont)->Some(c,nxtprm cont))
                 (nxtprm (uint64 FSTPRM-uint64 WHLPTRN.[WHLLMT],WHLLMT,-1,Array.empty,initcont)) }

请注意,MinHeap模块(包括功能模块和基于阵列的模块)都进行了调整&#34;添加的函数允许在每个新段页面的开头调整每个线程的PQ版本的剔除状态。另请注意,可以调整代码,以便大多数计算使用32位范围完成,最终序列输出为uint64,计算时间成本很低,因此目前理论范围超过100兆如果一个人愿意等待大约三到四个月来计算这个范围,那么(十个提升到十四个权力)。删除了数值范围检查,因为不太可能有人使用此算法计算该范围,更不用说通过它了。

使用纯函数MinHeap和2,3,5,7轮分解,上述程序计算0.062,0.629,10.53和195.62秒的前十万,一百万,一千万和一亿个素数,分别。使用基于阵列的MinHeap分别可将速度提高到0.097,0.276,3.48和51.60秒。使用2,3,5,7,11,13,17车轮将WHLPRMS改为&#34; [| 2u; 3u; 5u; 7u; 11u; 13u; 17u |]&#34;和FSTPRM到19u分别加速到0.181,0.308,2.49和36.58秒(对于恒定的因子改进和恒定的开销)。这个最快的调整在大约88.37秒内计算32位数字范围内的203,280,221个素数。 &#34; BFSZ&#34;对于较大范围,可以通过较慢的时间之间的折衷来调整常数,对于较大的范围,可以通过较低的时间进行权衡,其值为&#34; 1 <&lt;&lt;&lt;&lt; 14&#14;建议尝试更大范围。此常量仅设置最小缓冲区大小,程序自动调整大小超过该大小的缓冲区大小,以便缓冲区足以使页面范围所需的最大基本素数始终为&#34; strike&#34;每页至少一次;这意味着额外的铲斗筛的复杂性和开销。不需要。这个最后一个完全优化的版本可以使用&#34; primesPQOWSE()|&gt;测试,在大约256.8和3617.4秒(1000亿只超过一小时)内计算高达10和1000亿的素数。 Seq.takeWhile((&gt; =)100000000000UL)|&gt; Seq.fold(有趣的s - > s + 1UL)0UL&#34;输出。在这里,估计大约半天的素数达到万亿,一周高达十万亿,大约三到四个月高达一百万亿来自。

我认为使用增量SoE算法生成功能性或功能性很强的代码可以比这更快地运行。正如在查看代码时可以看到的那样,优化基本增量算法大大增加了代码复杂性,使得它可能比基于直接数组剔除的等效优化代码稍微复杂一点,该代码能够运行大约十倍于代码。这个并且在性能上没有额外的指数意味着这个功能性增量代码具有不断增加的额外百分比开销。

除了有趣的理论和智力观点之外,这还有用吗?可能不是。对于低至大约一千万的较小范围的素数,基本的未完全优化的增量功能SoE的最佳可能是足够的并且非常简单地编写或具有比最简单的命令式SoE更少的RAM存储器使用。但是,它们比使用数组的更强制性代码要慢得多,因此它们不会出现问题。对于那个以上的范围。虽然这里已经证明代码可以通过优化来加速,但它仍然比基于阵列的更纯粹的版本慢了10倍,但复杂性至少要增加到那么复杂。具有等效优化的代码,甚至DotNet上F#下的代码比使用直接编译为本机代码的C ++等语言慢大约四倍;如果一个人真的想调查大范围的素数,人们可能会使用其中一种其他语言和技术{{3}}可以在四小时内计算百万亿范围内的素数而不是大约需要三个月这个代码。的 END_EDIT_ADD

答案 2 :(得分:6)

对于使用序列的Eratosthenes筛选的算法增量(和递归)映射,这是一个非常最优化的优化,因为不需要对先前序列值进行记忆(除了缓存基本素数值有一点点优势之外)使用Seq.cache),主要优化是它对输入序列使用轮分解,并且它使用多个(递归)流来维护小于最新数字的平方根的基本素数,如下所示:

  let primesMPWSE =
    let whlptrn = [| 2;4;2;4;6;2;6;4;2;4;6;6;2;6;4;2;6;4;6;8;4;2;4;2;
                     4;8;6;4;6;2;4;6;2;6;6;4;2;4;6;2;6;4;2;4;2;10;2;10 |]
    let adv i = if i < 47 then i + 1 else 0
    let reinsert oldcmpst mp (prime,pi) =
      let cmpst = oldcmpst + whlptrn.[pi] * prime
      match Map.tryFind cmpst mp with
        | None -> mp |> Map.add cmpst [(prime,adv pi)]
        | Some(facts) -> mp |> Map.add cmpst ((prime,adv pi)::facts)
    let rec mkprimes (n,i) m ps q =
      let nxt = n + whlptrn.[i]
      match Map.tryFind n m with
        | None -> if n < q then seq { yield (n,i); yield! mkprimes (nxt,adv i) m ps q }
                  else let (np,npi),nlst = Seq.head ps,ps |> Seq.skip 1
                       let (nhd,ni),nxtcmpst = Seq.head nlst,n + whlptrn.[npi] * np
                       mkprimes (nxt,adv i) (Map.add nxtcmpst [(np,adv npi)] m) nlst (nhd * nhd)
        | Some(skips) -> let adjmap = skips |> List.fold (reinsert n) (m |> Map.remove n)
                         mkprimes (nxt,adv i) adjmap ps q
    let rec prs = seq {yield (11,0); yield! mkprimes (13,1) Map.empty prs 121 } |> Seq.cache
    seq { yield 2; yield 3; yield 5; yield 7; yield! mkprimes (11,0) Map.empty prs 121 |> Seq.map (fun (p,i) -> p) }

在大约0.445秒内发现第100,000个素数高达1,299,721,但不是一个适当的势在必行的EoS算法,它不会随着素数的增加而线性增加,需要7.775秒才能找到1,000,000个素数在这个约为O(n ^ 1.2)的范围内表现为15,485,867,其中n是找到的最大素数。

可以进行更多的调整,但对于更好的性能,可能不会产生很大的差异,如下所示:

  1. 由于F#序列库非常慢,可以使用实现IEnumerable的自定义类型来减少内部序列所花费的时间,但由于序列操作仅占总时间的20%,即使这些减少到零时间,结果也只会减少到80%的时间。

  2. 可以尝试其他形式的地图存储,例如@gradbot使用的O&#39; Neil或SkewBinomialHeap所提到的优先级队列,但至少对于SkewBinomialHeap,性能的提升只是一个百分之几。在选择不同的地图实现时,似乎只是在查找和删除列表开头附近的项目时更好的响应与插入新条目所花费的时间以获得这些好处,因此净收益几乎是一个洗并且在地图中增加条目时仍具有O(log n)性能。上面使用多个条目流来优化平方根的优化减少了地图中条目的数量,从而使这些改进不太重要。

  3. EDIT_ADD:我确实做了一点额外的优化,而且性能确实比预期提高了一些,可能是由于改进了消除Seq.skip作为推进通过基本素数序列。该优化使用内部序列生成的替换作为整数值的元组和用于前进到序列中的下一个值的延续函数,其中最终的F#序列由整体展开操作生成。代码如下:

    type SeqDesc<'a> = SeqDesc of 'a * (unit -> SeqDesc<'a>) //a self referring tuple type
    let primesMPWSE =
      let whlptrn = [| 2;4;2;4;6;2;6;4;2;4;6;6;2;6;4;2;6;4;6;8;4;2;4;2;
                       4;8;6;4;6;2;4;6;2;6;6;4;2;4;6;2;6;4;2;4;2;10;2;10 |]
      let inline adv i = if i < 47 then i + 1 else 0
      let reinsert oldcmpst mp (prime,pi) =
        let cmpst = oldcmpst + whlptrn.[pi] * prime
        match Map.tryFind cmpst mp with
          | None -> mp |> Map.add cmpst [(prime,adv pi)]
          | Some(facts) -> mp |> Map.add cmpst ((prime,adv pi)::facts)
      let rec mkprimes (n,i) m (SeqDesc((np,npi),nsdf) as psd) q =
        let nxt = n + whlptrn.[i]
        match Map.tryFind n m with
          | None -> if n < q then SeqDesc((n,i),fun() -> mkprimes (nxt,adv i) m psd q)
                    else let (SeqDesc((nhd,x),ntl) as nsd),nxtcmpst = nsdf(),n + whlptrn.[npi] * np
                         mkprimes (nxt,adv i) (Map.add nxtcmpst [(np,adv npi)] m) nsd (nhd * nhd)
          | Some(skips) -> let adjdmap = skips |> List.fold (reinsert n) (m |> Map.remove n)
                           mkprimes (nxt,adv i) adjdmap psd q
      let rec prs = SeqDesc((11,0),fun() -> mkprimes (13,1) Map.empty prs 121 )
      let genseq sd = Seq.unfold (fun (SeqDesc((n,i),tailfunc)) -> Some(n,tailfunc())) sd
      seq { yield 2; yield 3; yield 5; yield 7; yield! mkprimes (11,0) Map.empty prs 121 |> genseq }
    

    找到第100,000和第1,000,000个素数所需的时间分别约为0.31和5.1秒,因此这一小变化可获得相当大的百分比增长。我确实尝试了我自己的IEnumerable / IEnumerator接口的实现,这些接口是序列的基础,尽管它们比Seq模块使用的版本更快,但它们几乎没有对这个算法做出任何进一步的改变,因为大部分时间花费在地图功能。的 END_EDIT_ADD

    除了基于地图的增量EoS实现之外,还有另一个&#34;纯功能&#34;使用树折叠的实现,据说稍微快一点,但因为它在树折叠中仍然有一个O(log n)项我怀疑它主要会更快(如果是)由于算法如何实现与使用地图相比,计算机操作的数量。如果有兴趣的话,我也会开发那个版本。

    最后,我们必须接受增量EoS的纯函数实现不会接近于较大数值范围的良好命令式实现的原始处理速度。然而,人们可以想出一种方法,其中所有代码都是纯函数的,除了使用(可变)数组在一个范围内对复合数字进行分段筛分,这将接近O(n)性能并且在实际使用中将是50比大范围的功能算法快一百倍,例如前200,000,000个素数。这已由@Jon Harrop在his blog中完成,但可以通过很少的额外代码进一步调整。

答案 3 :(得分:5)

这是我尝试将Haskell代码合理忠实地翻译为F#:

#r "FSharp.PowerPack"

module Map =
  let insertWith f k v m =
    let v = if Map.containsKey k m then f m.[k] v else v
    Map.add k v m

let sieve =
  let rec sieve' map = function
  | LazyList.Nil -> Seq.empty
  | LazyList.Cons(x,xs) -> 
      if Map.containsKey x map then
        let facts = map.[x]
        let map = Map.remove x map
        let reinsert m p = Map.insertWith (@) (x+p) [p] m
        sieve' (List.fold reinsert map facts) xs
      else
        seq {
          yield x
          yield! sieve' (Map.add (x*x) [x] map) xs
        }
  fun s -> sieve' Map.empty (LazyList.ofSeq s)

let rec upFrom i =
  seq {
    yield i
    yield! upFrom (i+1)
  }

let primes = sieve (upFrom 2)

答案 4 :(得分:5)

使用邮箱处理器实现Prime筛:

let (<--) (mb : MailboxProcessor<'a>) (message : 'a) = mb.Post(message)
let (<-->) (mb : MailboxProcessor<'a>) (f : AsyncReplyChannel<'b> -> 'a) = mb.PostAndAsyncReply f

type 'a seqMsg =  
    | Next of AsyncReplyChannel<'a>   

type PrimeSieve() =   
    let counter(init) =   
        MailboxProcessor.Start(fun inbox ->   
            let rec loop n =   
                async { let! msg = inbox.Receive()   
                        match msg with
                        | Next(reply) ->   
                            reply.Reply(n)   
                            return! loop(n + 1) }   
            loop init)   

    let filter(c : MailboxProcessor<'a seqMsg>, pred) =   
        MailboxProcessor.Start(fun inbox ->   
            let rec loop() =   
                async {   
                    let! msg = inbox.Receive()   
                    match msg with
                    | Next(reply) ->
                        let rec filter prime =
                            if pred prime then async { return prime }
                            else async {
                                let! next = c <--> Next
                                return! filter next }
                        let! next = c <--> Next
                        let! prime = filter next
                        reply.Reply(prime)
                        return! loop()   
                }   
            loop()   
        )   

    let processor = MailboxProcessor.Start(fun inbox ->   
        let rec loop (oldFilter : MailboxProcessor<int seqMsg>) prime =   
            async {   
                let! msg = inbox.Receive()   
                match msg with
                | Next(reply) ->   
                    reply.Reply(prime)   
                    let newFilter = filter(oldFilter, (fun x -> x % prime <> 0))   
                    let! newPrime = oldFilter <--> Next
                    return! loop newFilter newPrime   
            }   
        loop (counter(3)) 2)   

    member this.Next() = processor.PostAndReply( (fun reply -> Next(reply)), timeout = 2000)

    static member upto max =
        let p = PrimeSieve()
        Seq.initInfinite (fun _ -> p.Next())
        |> Seq.takeWhile (fun prime -> prime <= max)
        |> Seq.toList

答案 5 :(得分:3)

这是我的两分钱,虽然我不确定它是否符合OP的标准,真正成为eratosthenes的筛子。它不使用模块化划分,并从OP引用的论文中实现优化。它只适用于有限列表,但在我看来,这应该是最初描述筛子的精神。另外,论文还谈到了标记数量和分割数量方面的复杂性。似乎我们必须遍历链表,这可能忽略了性能术语中各种算法的一些关键方面。通常,虽然与计算机的模块化划分是昂贵的操作。

open System

let rec sieve list =
    let rec helper list2 prime next =
        match list2 with
            | number::tail -> 
                if number< next then
                    number::helper tail prime next
                else
                    if number = next then 
                        helper tail prime (next+prime)
                    else
                        helper (number::tail) prime (next+prime)

            | []->[]
    match list with
        | head::tail->
            head::sieve (helper tail head (head*head))
        | []->[]

let step1=sieve [2..100]

编辑:修复了原始帖子中的代码错误。我试着按照筛子的原始逻辑进行一些修改。即从第一个项目开始,并从集合中交叉该项目的倍数。该算法从字面上寻找下一个素数的倍数,而不是对集合中的每个数字进行模块化除法。本文的一个优化是它开始寻找大于p ^ 2的素数的倍数。

辅助函数中具有多级的部分处理可能已经从列表中删除下一个素数的可能性。因此,例如使用素数5,它将尝试删除数字30,但它永远不会找到它,因为它已经被素数移除3.希望澄清算法的逻辑。

答案 6 :(得分:3)

为了它的价值,这不是Eratosthenes的筛子,但它的非常快:

let is_prime n =
    let maxFactor = int64(sqrt(float n))
    let rec loop testPrime tog =
        if testPrime > maxFactor then true
        elif n % testPrime = 0L then false
        else loop (testPrime + tog) (6L - tog)
    if n = 2L || n = 3L || n = 5L then true
    elif n <= 1L || n % 2L = 0L || n % 3L = 0L || n % 5L = 0L then false
    else loop 7L 4L
let primes =
    seq {
        yield 2L;
        yield 3L;
        yield 5L;
        yield! (7L, 4L) |> Seq.unfold (fun (p, tog) -> Some(p, (p + tog, 6L - tog)))
    }
    |> Seq.filter is_prime

它在我的机器上发现了1.25秒内的第100,000个素数(AMD Phenom II,3.2GHZ四核)。

答案 7 :(得分:3)

我知道你明确表示你对纯粹的功能性筛子实施感兴趣,所以我暂时不去展示我的筛子。但是在重新阅读你引用的论文后,我看到所提出的增量筛分算法与我自己的基本相同,唯一的区别是使用纯功能技术与绝对命令技术的实现细节。所以我认为我至少有一半的资格满足你的好奇心。此外,我认为使用命令式技术可以实现显着的性能提升但被功能接口隐藏起来是F#编程中鼓励的最强大的技术之一,而不是纯粹的Haskell文化。我首先在我的Project Euler for F#un blog上发布了这个实现,但是在这里重新发布了先前替换的代码并删除了结构类型。 primes可以在我的计算机上计算0.248秒内的前100,000个素数和4.8秒内的前1,000,000个素数(注意primes缓存其结果,因此每次执行时都需要重新评估它基准)。

let inline infiniteRange start skip = 
    seq {
        let n = ref start
        while true do
            yield n.contents
            n.contents <- n.contents + skip
    }

///p is "prime", s=p*p, c is "multiplier", m=c*p
type SievePrime<'a> = {mutable c:'a ; p:'a ; mutable m:'a ; s:'a}

///A cached, infinite sequence of primes
let primes =
    let primeList = ResizeArray<_>()
    primeList.Add({c=3 ; p=3 ; m=9 ; s=9})

    //test whether n is composite, if not add it to the primeList and return false
    let isComposite n = 
        let rec loop i = 
            let sp = primeList.[i]
            while sp.m < n do
                sp.c <- sp.c+1
                sp.m <- sp.c*sp.p

            if sp.m = n then true
            elif i = (primeList.Count-1) || sp.s > n then
                primeList.Add({c=n ; p=n ; m=n*n ; s=n*n})
                false
            else loop (i+1)
        loop 0

    seq { 
        yield 2 ; yield 3

        //yield the cached results
        for i in 1..primeList.Count-1 do
            yield primeList.[i].p

        yield! infiniteRange (primeList.[primeList.Count-1].p + 2) 2 
               |> Seq.filter (isComposite>>not)
    }

答案 8 :(得分:3)

这是另一种仅使用纯函数F#代码完成Eratosthenes增量筛选(SoE)的方法。它改编自Haskell代码开发为&#34;这个想法归功于Dave Bayer,尽管他使用了复杂的公式和平衡的三元树结构,以统一的方式逐渐加深(简化的公式和偏斜,加深到正确的二进制)由Heinrich Apfelmus引入的树结构,由Will Ness进一步简化)。由于M. O&#39; Neill&#34;根据以下链接:Optimized Tree Folding code using a factorial wheel in Haskell

以下代码有几个优化,使其更适合在F#中执行,如下所示:

  1. 代码使用coinductive流而不是LazyList,因为这个算法没有(或者很少)需要LazyList的记忆,我的coinductive流比任何一个LazyLists(来自FSharp)都更有效。 PowerPack)或内置序列。另一个优点是我的代码可以在tryFSharp.orgideone.com上运行,而无需复制和粘贴LazyList类型和模块的Microsoft.FSharp.PowerPack核心源代码(以及版权声明)

  2. 发现F#在函数参数上的模式匹配存在相当多的开销,因此使用元组的先前更具可读性的区分联合类型被牺牲,有利于按值结构(或类)因为在某些平台上运行速度更快,所以类型可以加速两倍或更多。

  3. Ness的优化是从线性树折叠到双向折叠再到多向折叠,并且使用车轮分解的改进对于F#而言与Haskell一样有效,主要区别在于两种语言是Haskell可以编译为本机代码并具有更高度优化的编译器,而F#在DotNet Framework系统下运行的开销更大。

    type prmstate = struct val p:uint32 val pi:byte new (prm,pndx) = { p = prm; pi = pndx } end
    type prmsSeqDesc = struct val v:prmstate val cont:unit->prmsSeqDesc new(ps,np) = { v = ps; cont = np } end
    type cmpststate = struct val cv:uint32 val ci:byte val cp:uint32 new (strt,ndx,prm) = {cv = strt;ci = ndx;cp = prm} end
    type cmpstsSeqDesc = struct val v:cmpststate val cont:unit->cmpstsSeqDesc new (cs,nc) = { v = cs; cont = nc } end
    type allcmpsts = struct val v:cmpstsSeqDesc val cont:unit->allcmpsts new (csd,ncsdf) = { v=csd;cont=ncsdf } end
    
    let primesTFWSE =
      let whlptrn = [| 2uy;4uy;2uy;4uy;6uy;2uy;6uy;4uy;2uy;4uy;6uy;6uy;2uy;6uy;4uy;2uy;6uy;4uy;6uy;8uy;4uy;2uy;4uy;2uy;
                       4uy;8uy;6uy;4uy;6uy;2uy;4uy;6uy;2uy;6uy;6uy;4uy;2uy;4uy;6uy;2uy;6uy;4uy;2uy;4uy;2uy;10uy;2uy;10uy |]
      let inline whladv i = if i < 47uy then i + 1uy else 0uy
      let inline advmltpl c ci p = cmpststate (c + uint32 whlptrn.[int ci] * p,whladv ci,p)
      let rec pmltpls cs = cmpstsSeqDesc(cs,fun() -> pmltpls (advmltpl cs.cv cs.ci cs.cp))
      let rec allmltpls (psd:prmsSeqDesc) =
        allcmpsts(pmltpls (cmpststate(psd.v.p*psd.v.p,psd.v.pi,psd.v.p)),fun() -> allmltpls (psd.cont()))
      let rec (^) (xs:cmpstsSeqDesc) (ys:cmpstsSeqDesc) = //union op for SeqDesc's
        match compare xs.v.cv ys.v.cv with
          | -1 -> cmpstsSeqDesc (xs.v,fun() -> xs.cont() ^ ys)
          | 0 -> cmpstsSeqDesc (xs.v,fun() -> xs.cont() ^ ys.cont())
          | _ -> cmpstsSeqDesc(ys.v,fun() -> xs ^ ys.cont()) //must be greater than
      let rec pairs (csdsd:allcmpsts) =
        let ys = csdsd.cont in
        allcmpsts(cmpstsSeqDesc(csdsd.v.v,fun()->csdsd.v.cont()^ys().v),fun()->pairs (ys().cont()))
      let rec joinT3 (csdsd:allcmpsts) = cmpstsSeqDesc(csdsd.v.v,fun()->
        let ys = csdsd.cont() in let zs = ys.cont() in (csdsd.v.cont()^(ys.v^zs.v))^joinT3 (pairs (zs.cont())))
      let rec mkprimes (ps:prmstate) (csd:cmpstsSeqDesc) =
        let nxt = ps.p + uint32 whlptrn.[int ps.pi]
        if ps.p >= csd.v.cv then mkprimes (prmstate(nxt,whladv ps.pi)) (csd.cont()) //minus function
        else prmsSeqDesc(prmstate(ps.p,ps.pi),fun() -> mkprimes (prmstate(nxt,whladv ps.pi)) csd)
      let rec baseprimes = prmsSeqDesc(prmstate(11u,0uy),fun() -> mkprimes (prmstate(13u,1uy)) initcmpsts)
      and initcmpsts = joinT3 (allmltpls baseprimes)
      let genseq sd = Seq.unfold (fun (psd:prmsSeqDesc) -> Some(psd.v.p,psd.cont())) sd
      seq { yield 2u; yield 3u; yield 5u; yield 7u; yield! mkprimes (prmstate(11u,0uy)) initcmpsts |> genseq }
    
    primesLMWSE |> Seq.nth 100000
    
  4. EDIT_ADD:由于原始代码没有正确处理流的尾部并将参数流的尾部传递给joinT3函数的对函数而不是尾部跟踪,因此已经更正了zs流。因此,下面的时间也得到了纠正,加速了大约30%。 tryFSharp和ideone链接代码也已更正。的 END_EDIT_ADD

    当n是计算的素数时,上述程序在大约O(n ^ 1.1)性能下工作,其中n为最大素数计算或约为O(n ^ 1.18),并且计算第一百万个素数需要大约2.16秒(使用结构类型而不是类运行64位代码的快速计算机上的前100,000个素数大约0.14秒)(似乎某些实现在形成闭包时将按钮结构框拆箱)。我认为这是任何这些纯函数素数算法的最大实际范围。这可能是运行纯函数SoE算法的最快速度,而不是为了减少常数因素而进行的一些小调整。

    除了组合分段和多线程以在多个CPU内核之间共享计算外,大多数&#34;调整&#34;可以对此算法进行的是增加车轮分解的周长,性能增益高达约40%,并且由于对结构,类,元组或更直接的个别参数的使用进行调整而导致的微小增益功能之间的状态传递。

    EDIT_ADD2:我已经完成了上述优化,结果是由于结构优化,代码现在几乎快了两倍,并且可选择使用更大的车轮分解周长增加了减少量。注意,下面的代码避免在主序列生成循环中使用continuation,并且仅在必要时使用它们用于基本素数流和从这些基本素数导出的后续复合数量剔除流。新代码如下:

    type CIS<'T> = struct val v:'T val cont:unit->CIS<'T> new(v,cont) = { v=v;cont=cont } end //Co-Inductive Steam
    let primesTFOWSE =
      let WHLPRMS = [| 2u;3u;5u;7u |] in let FSTPRM = 11u in let WHLCRC = int (WHLPRMS |> Seq.fold (*) 1u) >>> 1
      let WHLLMT = int (WHLPRMS |> Seq.fold (fun o n->o*(n-1u)) 1u) - 1
      let WHLPTRN =
        let wp = Array.zeroCreate (WHLLMT+1)
        let gaps (a:int[]) = let rec gap i acc = if a.[i]=0 then gap (i+1) (acc+1uy) else acc
                             {0..WHLCRC-1} |> Seq.fold (fun s i->
                               let ns = if a.[i]<>0 then wp.[s]<-2uy*gap (i+1) 1uy;(s+1) else s in ns) 0 |> ignore
        Array.init (WHLCRC+1) (fun i->if WHLPRMS |> Seq.forall (fun p->(FSTPRM+uint32(i<<<1))%p<>0u)
                                      then 1 else 0) |> gaps;wp
      let inline whladv i = if i < WHLLMT then i+1 else 0 in let inline advcnd c ci = c + uint32 WHLPTRN.[ci]
      let inline advmltpl p (c,ci) = (c + uint32 WHLPTRN.[ci] * p,whladv ci)
      let rec pmltpls p cs = CIS(cs,fun() -> pmltpls p (advmltpl p cs))
      let rec allmltpls k wi (ps:CIS<_>) =
        let nxt = advcnd k wi in let nxti = whladv wi
        if k < ps.v then allmltpls nxt nxti ps
        else CIS(pmltpls ps.v (ps.v*ps.v,wi),fun() -> allmltpls nxt nxti (ps.cont()))
      let rec (^) (xs:CIS<uint32*_>) (ys:CIS<uint32*_>) = 
        match compare (fst xs.v) (fst ys.v) with //union op for composite CIS's (tuple of cmpst and wheel ndx)
          | -1 -> CIS(xs.v,fun() -> xs.cont() ^ ys)
          | 0 -> CIS(xs.v,fun() -> xs.cont() ^ ys.cont())
          | _ -> CIS(ys.v,fun() -> xs ^ ys.cont()) //must be greater than
      let rec pairs (xs:CIS<CIS<_>>) =
        let ys = xs.cont() in CIS(CIS(xs.v.v,fun()->xs.v.cont()^ys.v),fun()->pairs (ys.cont()))
      let rec joinT3 (xs:CIS<CIS<_>>) = CIS(xs.v.v,fun()->
        let ys = xs.cont() in let zs = ys.cont() in (xs.v.cont()^(ys.v^zs.v))^joinT3 (pairs (zs.cont())))
      let rec mkprm (cnd,cndi,(csd:CIS<uint32*_>)) =
        let nxt = advcnd cnd cndi in let nxti = whladv cndi
        if cnd >= fst csd.v then mkprm (nxt,nxti,csd.cont()) //minus function
        else (cnd,cndi,(nxt,nxti,csd))
      let rec pCIS p pi cont = CIS(p,fun()->let (np,npi,ncont)=mkprm cont in pCIS np npi ncont)
      let rec baseprimes() = CIS(FSTPRM,fun()->let np,npi = advcnd FSTPRM 0,whladv 0
                                               pCIS np npi (advcnd np npi,whladv npi,initcmpsts))
      and initcmpsts = joinT3 (allmltpls FSTPRM 0 (baseprimes()))
      let inline genseq sd = Seq.unfold (fun (p,pi,cont) -> Some(p,mkprm cont)) sd
      seq { yield! WHLPRMS; yield! mkprm (FSTPRM,0,initcmpsts) |> genseq }
    

    上述代码大约需要0.07,1.02和14.58秒才能枚举前十万,百万和千万个素数,这些都是在64位模式下参考Intel i7-2700K(3.5 GHz)机器上的。这并不比从中导出此代码的参考Haskell实现慢得多,尽管由于在Silverlight下为tryfsharp处于32位模式,它在tryfsharpideone上稍微慢了一些(关于再一次慢一点)并且在一个较慢的机器上运行在Mono 2.0(这对于F#来说本质上要慢很多),因此比参考机器慢了大约五倍。请注意,ideone报告的运行时间包括嵌入式查找表数组的初始化时间,需要考虑的时间。

    上述程序的另一个特点是分解轮被参数化,例如,通过将WHLPRMS设置为[|]可以使用极大的轮子。 2u; 3u; 5u; 7u; 11u; 13u; 17u; 19u |]和FSTPRM到23u,对于1000万个素数,大约10.02秒的大范围获得大约三分之二的运行时间,尽管注意它需要几秒钟在程序开始运行之前计算WHLPTRN。

    极客注意:我还没有实现一个非共享定点组合器,用于伸缩式多级素数生产&#34;根据参考Haskell代码,虽然我试图这样做,因为我需要有一些类似Haskell的懒惰列表,以便工作而不会逃避无限循环和堆栈溢出。尽管我的Co-Inductive Streams(CIS&#39; s)具有一些懒惰的属性,但它们不是正式的惰性列表或缓存序列(它们变成非缓存序列,并且在传递时可以缓存,因此诸如&#34之类的功能; genseq&#34;我提供的最终输出序列)。我不想使用LazyList的PowerPack实现,因为它不是非常有效,并且需要将该源代码复制到tryfsharp和ideone中,而不提供导入的模块。当想要使用此算法所需的头/尾操作时,使用内置序列(甚至是高速缓存)是非常低效的,因为获取序列尾部的唯一方法是使用&#34; Seq.skip 1& #34;在多次使用时会产生一个基于原始序列的新序列,递归地多次跳过。我可以基于CIS实现我自己的高效LazyList类,但是当递归&#34; initcmpsts&#34;时,似乎很难证明这一点。和&#34; baseprimes&#34;对象占用很少的代码。此外,将LazyList传递给函数以生成对该LazyList的扩展,该函数仅使用来自Lazylist开头附近的值,这要求几乎整个LazyList被记忆以降低内存效率:前1000万个素数的传递将需要内存中的LazyList,包含近1.8亿个元素。所以我接受了这个传球。

    请注意,对于较大的范围(1000万个素数或更多),这个纯函数代码的速度与Eratosthenes或Atkins的Sieve的许多简单命令式实现的速度大致相同,尽管这是由于缺乏对这些命令的优化算法;使用等效优化和分段筛分阵列比使用等同优化和分段筛分阵列更加必要的实现仍然比我的&#34;几乎功能性#34;答案。

    另请注意,虽然可以使用树折叠实现分段筛分,但由于剔除推进算法被隐藏在用于&#34; union的延续内部,因此更加困难。操作员和解决这个问题意味着需要使用不断推进的剔除范围;这与其他算法不同,其中可以为每个新页面重置剔除变量的状态,包括减小其范围,因此如果使用大于32位的范围,则内部剔除范围仍然可以重置为在32内操作即使在每个素数的执行时间内以很少的成本确定64位的素数范围时,比特范围也是如此。的 END_EDIT_ADD2

答案 9 :(得分:2)

实际上我试图做同样的事情,我首先尝试了相同的天真实现,但它太慢了。然后我找到了这个页面YAPES: Problem Seven, Part 2,在那里他使用了基于Melissa E. O'Neill的真正的Eratosthenes筛子。我从那里获取代码,只是对它进行了一些修改,因为F#自发布以来发生了一些变化。

let reinsert x table prime = 
   let comp = x+prime 
   match Map.tryFind comp table with 
   | None        -> table |> Map.add comp [prime] 
   | Some(facts) -> table |> Map.add comp (prime::facts) 

let rec sieve x table = 
  seq { 
    match Map.tryFind x table with 
    | None -> 
        yield x 
        yield! sieve (x+1I) (table |> Map.add (x*x) [x]) 
    | Some(factors) -> 
        yield! sieve (x+1I) (factors |> List.fold (reinsert x) (table |> Map.remove x)) } 

let primes = 
  sieve 2I Map.empty

primes |> Seq.takeWhile (fun elem -> elem < 2000000I) |> Seq.sum

答案 10 :(得分:2)

我不认为这个问题是完整的,只考虑纯粹的功能算法,在几个答案的情况下隐藏状态在地图或优先级队列中,或者在我的其他人的情况下隐藏折叠的合并树答案在于,由于它们的近似O(n ^ 1.2)性能,所以这些中的任何一个都对于大范围质数的可用性非常有限(&#39; ^&#39;意味着提升到其中n是最高数字的幂在序列中)以及每次剔除操作的计算开销。这意味着即使对于32位数字范围,这些算法也会花费至少几分钟的时间来生成高达40亿以上的素数,这不是非常有用的。

有几个答案提出了使用不同程度的可变性的解决方案,但它们要么没有充分利用它们的可变性而且效率低下,或者只是非常简单的命令式代码翻译和功能丑陋。在我看来,F#可变数组只是在数据结构中隐藏可变状态的另一种形式,并且可以开发一种有效的算法, 没有使用其他可变性 用于通过分页缓冲区段有效剔除复合数字的可变缓冲区数组,其余代码以纯函数式编写。

以下代码是在看到code by Jon Harrop后开发的,并对这些想法进行了改进,如下所示:

  1. Jon的代码在高内存使用方面失败(将所有生成的素数而不是素数保存到最高候选素数的平方根,并不断重新生成不断增大的大小的缓冲区数组(等于找到的最后一个素数的大小),与CPU缓存大小无关。

  2. 同样,他提供的代码不包括生成序列。

  3. 此外,所提供的代码没有至少只处理奇数的优化,更不用说不考虑使用轮分解了。

  4. 如果使用Jon的代码生成到40亿个数字范围40亿以上的质数范围,那么对于列表结构中保存的素数,它将具有千兆字节的内存需求。筛分缓冲器的千兆字节,虽然没有真正的原因,后者不能是固定的较小尺寸。一旦筛选缓冲区超过CPU缓存大小的大小,性能将在&#34;缓存抖动中迅速恶化,随着首先是L1,然后是L2,最后是L3(如果存在)的性能损失增加超出尺寸。

    这就是为什么Jon的代码在生成内存不足异常之前,即使在我的64位机器上也只计算大约2500万左右的素数,这也解释了原因。随着范围的增加,范围变得越来越大,相对性能下降越来越大,并且随着范围的增加,性能大约为O(n ^ 1.4),并且由于开始时具有如此低的计算复杂度而仅稍微节省了。

    以下代码解决了所有这些限制,因为它只记忆基本素数,直到根据需要计算的范围内最大数的平方根(在32位的情况下只有几千字节)对于每个基本素数生成器和主页分段筛选器(小于大多数现代CPU的L1高速缓存大小),仅使用大约16千字节的非常小的缓冲区,以及包括生成序列代码和(当前)稍微优化以仅筛选奇数,这意味着更有效地使用存储器。另外,使用打包位阵列来进一步提高存储效率;它的计算成本主要是为了在扫描缓冲区时需要进行较少的计算。

    let primesAPF32() =
      let rec oddprimes() =
        let BUFSZ = 1<<<17 in let buf = Array.zeroCreate (BUFSZ>>>5) in let BUFRNG = uint32 BUFSZ<<<1
        let inline testbit i = (buf.[i >>> 5] &&& (1u <<< (i &&& 0x1F))) = 0u
        let inline cullbit i = let w = i >>> 5 in buf.[w] <- buf.[w] ||| (1u <<< (i &&& 0x1F))
        let inline cullp p s low = let rec cull' i = if i < BUFSZ then cullbit i; cull' (i + int p)
                                   cull' (if s >= low then int((s - low) >>> 1)
                                          else let r = ((low - s) >>> 1) % p in if r = 0u then 0 else int(p - r))
        let inline cullpg low = //cull composites from whole buffer page for efficiency
          let max = low + BUFRNG - 1u in let max = if max < low then uint32(-1) else max
          let sqrtlm = uint32(sqrt(float max)) in let sqrtlmndx = int((sqrtlm - 3u) >>> 1)
          if low <= 3u then for i = 0 to sqrtlmndx do if testbit i then let p = uint32(i + i + 3) in cullp p (p * p) 3u
          else baseprimes |> Seq.skipWhile (fun p -> //force side effect of culling to limit of buffer
              let s = p * p in if p > 0xFFFFu || s > max then false else cullp p s low; true) |> Seq.nth 0 |> ignore
        let rec mkpi i low =
          if i >= BUFSZ then let nlow = low + BUFRNG in Array.fill buf 0 buf.Length 0u; cullpg nlow; mkpi 0 nlow
          else (if testbit i then i,low else mkpi (i + 1) low)
        cullpg 3u; Seq.unfold (fun (i,lw) -> //force cull the first buffer page then doit
            let ni,nlw = mkpi i lw in let p = nlw + (uint32 ni <<< 1)
            if p < lw then None else Some(p,(ni+1,nlw))) (0,3u)
      and baseprimes = oddprimes() |> Seq.cache
      seq { yield 2u; yield! oddprimes() }
    
    primesAPF32() |> Seq.nth 203280220 |> printfn "%A"
    

    这个新代码计算32位数字范围内的203,280,221个素数,大约 ADDED / CORRECTED:25.4秒,前100,000,100万,1000万的运行时间和在快速台式计算机(i7-2700K @ 3.5 GHz)上分别测试了1亿,分别为0.01,0.088,0.94和11.25秒,并且可以在tryfsharp.orgideone.com上运行,尽管由于执行时间的限制,后者的范围较小。它的性能比Jon Harrop的几千个素数的小范围更差,因为它增加了计算复杂度,但由于其更好的性能算法可以很快地将其传递给更大的范围这种复杂性使得第一千万次素数的速度提高了大约五倍,而在约翰逊的第二百万次传播之前,它的速度提高了大约七倍。

    在总执行时间中,超过一半用于基本序列枚举,因此在很大程度上不会通过将复合数量剔除操作作为后台操作来帮助,尽管组合中的车轮分解优化会有所帮助(虽然计算密集程度更高,但这种复杂性会在后台运行,因为它们减少了枚举所需的缓冲区测试操作的数量。如果序列的顺序不需要保留(如仅计算素数或总和素数),则可以进行进一步的优化,因为可以编写序列以支持并行枚举接口或代码可以写成一个类,以便成员方法可以在没有枚举的情况下进行计算。可以轻松调整此代码以提供与C#代码接近的相同性能,但更简洁地表达,尽管它永远不会达到优化的C ++本机代码(如PrimeSieve的性能,而{{3}}主要针对任务进行了优化只计算一个范围内的素数的数量,并且可以计算32位数范围内的素数的数量是一小部分(i7-2700K上的0.25秒)。

    因此,进一步的优化将集中在使用车轮分解进一步对筛分阵列进行打包以最小化在剔除复合数字时所做的工作,试图提高所得到的素数的枚举效率,并将所有复合剔除降级为背景线程,四到八个核心处理器可以隐藏所需的额外计算复杂性。

    它主要是纯函数代码,只是它使用一个可变数组来合并复合剔除....

答案 11 :(得分:2)

由于这个问题特别要求其他算法,我提供了以下实现:

  

或者可能知道替代实施方法或筛选算法

在没有提到Sieve of Atkin(SoA)的情况下,没有提交各种Eratosthenes Sieve(SoE)算法,这实际上是使用一组二次方程的解来实现复合的SoE变体剔除以及消除基本素数的所有多个方格(素数小于或等于为素数测试的最大数的平方根)。从理论上讲,SoA比SoE更有效,因为在该范围内的操作稍微减少,因此在大约10到1亿的范围内它应该具有大约20%的复杂度,但实际上它通常较慢,因为解决几个二次方程的复杂性的计算开销。虽然,高度优化的Daniel J. Bernstein's C implementation比他针对特定测试数范围测试的SoE实施更快,但他测试的SoE实现并不是最优化的,而且更多高度优化的直接SoE版本仍然更快。这似乎是这里的情况,尽管我承认可能还有一些我错过的优化。

由于O&#39; Neill在使用增量无界Sieves的SoE论文中主要表明Turner Sieve在算法和性能方面都不是SoE,她没有考虑SoE的许多其他变体比如SoA。快速搜索文献,我发现SoA没有应用到我们在这里讨论的无界增量序列中,所以我自己进行了修改,如下面的代码所示。

正如纯粹的SoE无界情况可以被视为具有素数倍数的无界序列的复合数,SoA认为具有二次方的所有表达式的无界序列的无界序列作为潜在素数。具有两个自由变量之一的方程式,&#39; x&#39;或者&#39; y&#39;固定为起始值并单独排除&#34;消除&#34;所有多个基本素数序列的序列,其最后与SoE序列的复合消除序列非常相似,只是序列通过素数的平方而不是(较小的)倍数更快地前进。素数。我试图减少二次方程序列的数量,这是为了识别为了增量筛的目的,&#34; 3 * x ^ 2 + y ^ 2&#34; &#34; 3 * x ^ 2 - y ^ 2&#34;序列实际上是相同的,除了第二项的符号并消除所有非奇数的解,以及应用2357轮分解(尽管SoA已经具有固有的235轮分解)。它使用有效的树折叠合并/组合算法,如在SoE树合并中处理每个序列序列,但简化了联合运算符在合并中不合并,因为SoA算法依赖于能够基于为特定值找到的二次解的数量。由于大约三倍的开销操作处理大约三倍于更复杂序列的数量,因此代码比树合并SoE慢,但是可能会有一系列非常大的数量的筛选,因为它会通过SoE它的理论性能优势。

以下代码适用于SoA的制定,使用CoInductive Stream类型而不是LazyList或序列,因为不需要记忆并且性能更好,也不使用判别联合并避免模式匹配表现原因:

#nowarn "40"
type cndstate = class val c:uint32 val wi:byte val md12:byte new(cnd,cndwi,mod12) = { c=cnd;wi=cndwi;md12=mod12 } end
type prmsCIS = class val p:uint32 val cont:unit->prmsCIS new(prm,nxtprmf) = { p=prm;cont=nxtprmf } end
type stateCIS<'b> = class val v:uint32 val a:'b val cont:unit->stateCIS<'b> new(curr,aux,cont)= { v=curr;a=aux;cont=cont } end
type allstateCIS<'b> = class val ss:stateCIS<'b> val cont:unit->allstateCIS<'b> new(sbstrm,cont) = { ss=sbstrm;cont=cont } end

let primesTFWSA() =
  let WHLPTRN = [| 2uy;4uy;2uy;4uy;6uy;2uy;6uy;4uy;2uy;4uy;6uy;6uy;2uy;6uy;4uy;2uy;6uy;4uy;6uy;8uy;4uy;2uy;4uy;2uy;
                   4uy;8uy;6uy;4uy;6uy;2uy;4uy;6uy;2uy;6uy;6uy;4uy;2uy;4uy;6uy;2uy;6uy;4uy;2uy;4uy;2uy;10uy;2uy;10uy |]
  let rec prmsqrs v sqr = stateCIS(v,sqr,fun() -> let n=v+sqr+sqr in let n=if n<v then 0xFFFFFFFFu else n in prmsqrs n sqr)
  let rec allsqrs (prms:prmsCIS) = let s = prms.p*prms.p in allstateCIS(prmsqrs s s,fun() -> allsqrs (prms.cont()))
  let rec qdrtc v y = stateCIS(v,y,fun() -> let a=(y+1)<<<2 in let a=if a<=0 then (if a<0 then -a else 2) else a
                                            let vn=v+uint32 a in let vn=if vn<v then 0xFFFFFFFFu else vn in qdrtc vn (y+2))
  let rec allqdrtcsX4 x = allstateCIS(qdrtc (((x*x)<<<2)+1u) 1,fun()->allqdrtcsX4 (x+1u))
  let rec allqdrtcsX3 x = allstateCIS(qdrtc (((x*(x+1u))<<<1)-1u) (1 - int x),fun() -> allqdrtcsX3 (x+1u))
  let rec joinT3 (ass:allstateCIS<'b>) = stateCIS<'b>(ass.ss.v,ass.ss.a,fun()->
    let rec (^) (xs:stateCIS<'b>) (ys:stateCIS<'b>) = //union op for CoInductiveStreams
      match compare xs.v ys.v with
        | 1 -> stateCIS(ys.v,ys.a,fun() -> xs ^ ys.cont())
        | _ -> stateCIS(xs.v,xs.a,fun() -> xs.cont() ^ ys) //<= then keep all the values without combining
    let rec pairs (ass:allstateCIS<'b>) =
      let ys = ass.cont
      allstateCIS(stateCIS(ass.ss.v,ass.ss.a,fun()->ass.ss.cont()^ys().ss),fun()->pairs (ys().cont()))
    let ys = ass.cont() in let zs = ys.cont() in (ass.ss.cont()^(ys.ss^zs.ss))^joinT3 (pairs (zs.cont())))
  let rec mkprm (cs:cndstate) (sqrs:stateCIS<_>) (qX4:stateCIS<_>) (qX3:stateCIS<_>) tgl =
    let inline advcnd (cs:cndstate) =
      let inline whladv i = if i < 47uy then i + 1uy else 0uy
      let inline modadv m a = let md = m + a in if md >= 12uy then md - 12uy else md
      let a = WHLPTRN.[int cs.wi] in let nc = cs.c+uint32 a
      if nc<cs.c then failwith "Tried to enumerate primes past the numeric range!!!"
      else cndstate(nc,whladv cs.wi,modadv cs.md12 a)
    if cs.c>=sqrs.v then mkprm (if cs.c=sqrs.v then advcnd cs else cs) (sqrs.cont()) qX4 qX3 false //squarefree function
    elif cs.c>qX4.v then mkprm cs sqrs (qX4.cont()) qX3 false
    elif cs.c>qX3.v then mkprm cs sqrs qX4 (qX3.cont()) false
    else match cs.md12 with
            | 7uy -> if cs.c=qX3.v then mkprm cs sqrs qX4 (qX3.cont()) (if qX3.a>0 then not tgl else tgl) //only for a's are positive
                     elif tgl then prmsCIS(cs.c,fun() -> mkprm (advcnd cs) sqrs qX4 qX3 false)
                     else mkprm (advcnd cs) sqrs qX4 qX3 false
            | 11uy -> if cs.c=qX3.v then mkprm cs sqrs qX4 (qX3.cont()) (if qX3.a<0 then not tgl else tgl) //only for a's are negatve
                      elif tgl then prmsCIS(cs.c,fun() -> mkprm (advcnd cs) sqrs qX4 qX3 false)
                      else mkprm (advcnd cs) sqrs qX4 qX3 false
            | _ -> if cs.c=qX4.v then mkprm cs sqrs (qX4.cont()) qX3 (not tgl) //always must be 1uy or 5uy
                   elif tgl then prmsCIS(cs.c,fun() -> mkprm (advcnd cs) sqrs qX4 qX3 false)
                   else mkprm (advcnd cs) sqrs qX4 qX3 false
  let qX4s = joinT3 (allqdrtcsX4 1u) in let qX3s = joinT3 (allqdrtcsX3 1u)
  let rec baseprimes = prmsCIS(11u,fun() -> mkprm (cndstate(13u,1uy,1uy)) initsqrs qX4s qX3s false)
  and initsqrs = joinT3 (allsqrs baseprimes)
  let genseq ps = Seq.unfold (fun (psd:prmsCIS) -> Some(psd.p,psd.cont())) ps
  seq { yield 2u; yield 3u; yield 5u; yield 7u;
        yield! mkprm (cndstate(11u,0uy,11uy)) initsqrs qX4s qX3s false |> genseq }

如上所述,代码比Tree Folding Wheel Optimized SoE慢,因为在前100,000个素数中,在另一个答案中发布了大约半秒,并且对于素数发现的性能具有大致相同的经验O(n ^ 1.2)最好的其他纯功能解决方案。可以尝试的一些进一步的优化是素数平方序列不使用轮分解来消除357个正方形的倍数,或者甚至仅使用素数正方形的素数倍来减少正方形序列流中的值的数量并且可能与二次方程表达序列流相关的其他优化。

EDIT_ADD:我花了一点时间来研究SoA模数优化,看看除了上面的&#34; squarefree&#34;优化,可能不会产生很大的不同,二次序列在每15个元素上具有模数模式,这将允许许多传递的切换复合测试值被预先筛选,并且将消除对特定模数的需要每个复合数字的12个操作。所有这些优化都可能导致提交到树折叠的计算工作减少高达约50%,以使稍微更优化的SoA版本运行接近或稍微好于合并SoE的最佳树折叠。我不知道何时可以抽出时间进行这几天的调查以确定结果。的 END_EDIT_ADD

EDIT_ADD2:在进行上述优化时,确实会将性能提高大约两倍,我明白为什么当前经验性能随着增加n而不如SoE:而SoE特别适用于树木折叠操作,因为第一个序列更密集,更经常重复,后来的序列密度更低,SoA&#34; 4X&#34;对于后来的序列,当它们被添加时,序列更密集,而#3; 3X&#34;序列开始时密度较低,当y接近零时它们变得更密集,然后再次变得不那么密集;这意味着呼叫/返回序列不像SoE那样保持在最小深度,但是该深度增加超过与数字范围成比例。使用折叠的解决方案并不漂亮,因为人们可以为密度随时间增加的序列实现左折叠,但仍然留下了&#34; 3X&#34;序列进行了很差的优化,正如打破&#34; 3X&#34;序列分为正面和负面。最简单的解决方案是将所有序列保存到Map中,这意味着访问时间将增加类似于范围的平方根的对数,但是对于比当前树折叠更大的数字范围,这将更好。的 END_EDIT_ADD2

虽然速度较慢,但​​我在此提出这个解决方案,以展示如何将代码演变为表达最初为F#中的纯函数代码设想的想法。它提供了在CoInductive Streams中使用continuation来实现懒惰而不使用Lazy类型的示例,实现(尾部)递归循环以避免任何可变性要求,通过递归调用来线程化累加器(tgl)以获得结果(数量)乘以二次方程&#34;敲击&#34;测试数字),将方程解为(懒惰)序列(或本例中的流)等等。

对于那些希望在没有基于Windows的开发系统的情况下进一步使用此代码的人,我也将其发布到tryfsharp.orgIdeone.com,尽管它在这两个平台上运行速度较慢, tryfsharp与本地客户端计算机的速度成正比,由于在Silverlight下运行而速度较慢,而且在Mono-project 2.0下运行在Linux服务器CPU上的Ideone,这在实现方面尤其是垃圾收集方面非常慢。

答案 12 :(得分:1)

我不太熟悉Haskell多图,但是F# Power Pack有一个HashMultiMap类,其xmldoc摘要是:“哈希表,默认情况下基于F#结构”哈希“和(=)函数。 table可以将单个键映射到多个绑定。“也许这对你有帮助吗?

答案 13 :(得分:1)

在Corei5上1秒内

2 * 10 ^ 6

let n = 2 * (pown 10 6)
let sieve = Array.append [|0;0|] [|2..n|]

let rec filterPrime p = 
    seq {for mul in (p*2)..p..n do 
            yield mul}
        |> Seq.iter (fun mul -> sieve.[mul] <- 0)

    let nextPrime = 
        seq { 
            for i in p+1..n do 
                if sieve.[i] <> 0 then 
                    yield sieve.[i]
        }
        |> Seq.tryHead

    match nextPrime with
        | None -> ()
        | Some np -> filterPrime np

filterPrime 2

let primes = sieve |> Seq.filter (fun x -> x <> 0)

答案 14 :(得分:0)

我已经提交了answer that is "Almost Functional",并且不想通过进一步的添加/改进来混淆它,所以我提交了这个答案,其中包括最大轮分解和多线程处理-在我看来,买电脑具有多线程(甚至智能手机都是多核的),运行单线程就像买一辆装有多缸发动机的汽车,然后只用一个就开火。

同样,下面的代码除了删除缓存缓冲区内容的变化以及与枚举有关的优化(如果使用的话)之外,大多数都起作用,这总是需要当前状态的想法(尽管这些细节被某些较慢的方法隐藏了)这样做,例如使用F#的内置seq-但速度很慢);代码如下:

/// F# version of the Wheel Factorized  Sieve of Eratosthenes...

/// This is a "combo" sieve where
///   it is fully wheel factorized by the primes of 2, 3, 5, and 7; then
///   pre-sieved by the pattern of the 11, 13, 17, and 19 primes...

/// This version is almost fully functional with no mutation used except for
/// the contents of the sieve buffer arrays on composite number culling/sieving.

module SoE =

  type private Prime = uint64 // JavaScript doesn't have anything bigger!
  type private PrimeNdx = int64
  type private BasePrimeRep = uint32

  let inline public prime n = uint64 n // match these convenience conversions
  let inline private primendx n = int64 n // with the types above!
  let inline private bprep n = uint32 n // with the types above!

  let private cPGSZBTS = (1 <<< 14) * 8 // sieve buffer size in bits = CPUL1CACHE - THIS SHOULD REALLY BE AN ARGUMENT!!!!

  type private BasePrimeRepArr = BasePrimeRep[]
  type private SieveBuffer = uint8[][] // multiple levels by residue index, by segment, by byte

  /// a Co-Inductive Stream (CIS) of an "infinite" non-memoized series...
  type private CIS<'T> = CIS of 'T * (unit -> CIS<'T>) //' apostrophe formatting adjustment

  /// lazy list (memoized) series of base prime page arrays...
  type private BasePrime = uint32
  type private BasePrimeRepArrs =
    BasePrimeRepArrs of BasePrimeRepArr * Option<Lazy<BasePrimeRepArrs>>

// constants and Look Up Tables to do with culling start address calculation...
  let private FRSTSVPRM = prime 23 // past the precull primes!
  let private WHLNDXCNST = primendx (FRSTSVPRM * (FRSTSVPRM - prime 1) >>> 1)
  let private WHLPRMS = [| prime 2; prime 3; prime 5; prime 7;
                           prime 11; prime 13; prime 17; prime 19 |]
  let private WHLHITS = 48 // (3 - 1) * (5 - 1) * (7 - 1)!
  let private WHLODDCRC = 105 // 3 * 5 * 7; no evens!
  let private WHLPTRNLEN = 11 * 13 * 17 * 19 // repeating pattern of pre-cull primes
  let private NUMPCULLPRMS = 4
  let private PCULLPRMREPS: BasePrimeRepArrs =
    BasePrimeRepArrs( [| uint32 (-1 <<< 6) + 44u; uint32 (-1 <<< 6) + 45u;
                         uint32 (-1 <<< 6) + 46u; uint32 (-1 <<< 6) + 47u |], None)
  // number of primes to a million minus number wheel prims; go sieving to 10^12
  let private NUMSTRTSBASEPRMS = 78498 + WHLPRMS.Length + 1 // +1 for end 0xFFFFFFFFu
  let private NUMSTRTSPRMS = (6542 - WHLPRMS.Length + 1) // enough for  65536 squared
  let private RESIDUES = [|
    23; 29; 31; 37; 41; 43; 47; 53; 59; 61; 67; 71;
    73; 79; 83; 89; 97; 101; 103; 107; 109; 113; 121; 127;
    131; 137; 139; 143; 149; 151; 157; 163; 167; 169; 173; 179;
    181; 187; 191; 193; 197; 199; 209; 211; 221; 223; 227; 229; 233 |]
  let private WHLNDXS = [|
    0; 0; 0; 1; 2; 2; 2; 3; 3; 4; 5; 5; 6; 6; 6;
    7; 7; 7; 8; 9; 9; 9; 10; 10; 11; 12; 12; 12; 13; 13;
    14; 14; 14; 15; 15; 15; 15; 16; 16; 17; 18; 18; 19; 20; 20;
    21; 21; 21; 21; 22; 22; 22; 23; 23; 24; 24; 24; 25; 26; 26;
    27; 27; 27; 28; 29; 29; 29; 30; 30; 30; 31; 31; 32; 33; 33;
    34; 34; 34; 35; 36; 36; 36; 37; 37; 38; 39; 39; 40; 41; 41;
    41; 41; 41; 42; 43; 43; 43; 43; 43; 44; 45; 45; 46; 47; 47; 48 |]
  let private WHLRNDUPS = [| // two rounds to avoid overflow; used in start address calcs...
      0; 3; 3; 3; 4; 7; 7; 7; 9; 9; 10; 12; 12; 15; 15;
      15; 18; 18; 18; 19; 22; 22; 22; 24; 24; 25; 28; 28; 28; 30;
      30; 33; 33; 33; 37; 37; 37; 37; 39; 39; 40; 42; 42; 43; 45;
      45; 49; 49; 49; 49; 52; 52; 52; 54; 54; 57; 57; 57; 58; 60;
      60; 63; 63; 63; 64; 67; 67; 67; 70; 70; 70; 72; 72; 73; 75;
      75; 78; 78; 78; 79; 82; 82; 82; 84; 84; 85; 87; 87; 88; 93;
      93; 93; 93; 93; 94; 99; 99; 99; 99; 99; 100; 102; 102; 103; 105;
      105; 108; 108; 108; 109; 112; 112; 112; 114; 114; 115; 117; 117; 120; 120;
      120; 123; 123; 123; 124; 127; 127; 127; 129; 129; 130; 133; 133; 133; 135;
      135; 138; 138; 138; 142; 142; 142; 142; 144; 144; 145; 147; 147; 148; 150;
      150; 154; 154; 154; 154; 157; 157; 157; 159; 159; 162; 162; 162; 163; 165;
      165; 168; 168; 168; 169; 172; 172; 172; 175; 175; 175; 177; 177; 178; 180;
      180; 183; 183; 183; 184; 187; 187; 187; 189; 189; 190; 192; 192; 193; 198;
      198; 198; 198; 198; 199; 204; 204; 204; 204; 204; 205; 207; 207; 208; 210; 210 |]
  /// LUT of relative cull start points given the residual bit plane index (outer index),
  /// and the combination of the base prime residual index and the bit plane index of
  /// the first cull position for the page (multiply combined for the inner index), giving
  /// a 16-bit value which contains the multipier (the upper byte) and the extra
  /// cull index offset (the lower byte) used to multiply by the base prime wheel index
  /// and add the offset with the result added with the start wheel index to obtain
  /// the residual bit plane start wheel index...
  /// for "PG11", these arrays get huge (quarter meg elements with elements of 4 bytes for
  /// a megabyte size), which will partially (or entirely) cancell out the benefit for
  /// smaller prime ranges; may help for the huge prime ranges...
  let private WHLSTRTS: uint16[][] =
    let arr = Array.init WHLHITS <| fun _ -> Array.zeroCreate (WHLHITS * WHLHITS)
    for pi = 0 to WHLHITS - 1 do
      let mltsarr = Array.zeroCreate WHLHITS
      let p = RESIDUES.[pi] in let s = (p * p - int FRSTSVPRM) >>> 1 
      // build array of relative mults and offsets to `s`...
      { 0 .. WHLHITS - 1 } |> Seq.iter (fun ci ->
        let rmlt0 = (RESIDUES.[(pi + ci) % WHLHITS] - RESIDUES.[pi]) >>> 1
        let rmlt = rmlt0 + if rmlt0 < 0 then WHLODDCRC else 0 in let sn = s + p * rmlt
        let snd = sn / WHLODDCRC in let snm = sn - snd * WHLODDCRC
        mltsarr.[WHLNDXS.[snm]] <- rmlt) // new rmlts 0..209!
      let ondx = pi * WHLHITS
      { 0 .. WHLHITS - 1 } |> Seq.iter (fun si ->
        let s0 = (RESIDUES.[si] - int FRSTSVPRM) >>> 1 in let sm0 = mltsarr.[si]
        { 0 .. WHLHITS - 1 } |> Seq.iter (fun ci ->
          let smr = mltsarr.[ci]
          let rmlt = if smr < sm0 then smr + WHLODDCRC - sm0 else smr - sm0
          let sn = s0 + p * rmlt in let rofs = sn / WHLODDCRC
          // we take the multiplier times 2 so it multiplies by the odd wheel index...
          arr.[ci].[ondx + si] <- (rmlt <<< 9) ||| rofs |> uint16))
    arr

  let private makeSieveBuffer btsz: SieveBuffer =
    let sz = ((btsz + 31) >>> 5) <<< 2 // rounded up to nearest 32 bit boundary
    { 1 .. WHLHITS } |> Seq.map (fun _ -> Array.zeroCreate sz) |> Array.ofSeq

  // a dedicated BITMSK LUT may be faster than bit twiddling...
  let private BITMSK = [| 1uy; 2uy; 4uy; 8uy; 16uy; 32uy; 64uy; 128uy |]

  /// all the heavy lifting work is done here...
  let private cullSieveBuffer (lwi: PrimeNdx) (bpras: BasePrimeRepArrs)
                              (strtsa: uint32[]) (sb: SieveBuffer) =
    let sz = sb.[0].Length in let szbits = sz <<< 3 in let bplmt = sz >>> 4
    let lowndx = lwi * primendx WHLODDCRC
    let nxti = (lwi + primendx szbits) * primendx WHLODDCRC
    // set up strtsa for use by each modulo residue bit plane...
    let rec looppi ((BasePrimeRepArrs(bpra, bprastl)) as obpras) pi j =
      if pi < bpra.Length then
        let ndxdprm = bpra.[pi] in let rsd = RESIDUES.[int ndxdprm &&& 0x3F]
        let bp = (int ndxdprm >>> 6) * (WHLODDCRC <<< 1) + rsd
        let i = (bp - int FRSTSVPRM) >>> 1 |> primendx
        let s = (i + i) * (i + primendx FRSTSVPRM) + WHLNDXCNST
        if s >= nxti then strtsa.[j] <- 0xFFFFFFFFu else // enough base primes!
          let si = if s >= lowndx then int (s - lowndx) else
                    let wp = (rsd - int FRSTSVPRM) >>> 1
                    let r = (lowndx - s) %
                              (primendx bp * primendx WHLODDCRC) |> int
                    if r = 0 then 0 else
                    bp * (WHLRNDUPS.[wp + (int r + bp - 1) / bp] - wp) - r
          let sd = si / WHLODDCRC in let sn = WHLNDXS.[si - sd * WHLODDCRC]
          strtsa.[j] <- (uint32 sn <<< 26) ||| uint32 sd
          looppi obpras (pi + 1) (j + 1)
      else match bprastl with
           | None -> ()
           | Some lv -> looppi lv.Value 0 j       
    looppi bpras 0 0
    // do the culling based on the preparation...
    let rec loopri ri =
      if ri < WHLHITS then
        let pln = sb.[ri] in let plnstrts = WHLSTRTS.[ri]
        let rec looppi (BasePrimeRepArrs(bpra, bprastl) as obpras) pi =
          if pi < bpra.Length then
            let prmstrt = strtsa.[pi]
            if prmstrt < 0xFFFFFFFFu then
              let ndxdprm = bpra.[pi]
              let pd = int ndxdprm >>> 6 in let prmndx = int ndxdprm &&& 0x3F
              let bp = pd * (WHLODDCRC <<< 1) + RESIDUES.[prmndx]
              let sd = int prmstrt &&& 0x3FFFFFF in let sn = int (prmstrt >>> 26)
              let adji = prmndx * WHLHITS + sn in let adj = plnstrts.[adji]
              let s0 = sd + int (adj >>> 8) * pd + (int adj &&& 0xFF)
              if bp < bplmt then
                let slmt = min szbits (s0 + (bp <<< 3))
                let rec loops s8 =
                  if s8 < slmt then
                    let msk = BITMSK.[s8 &&& 7]
                    let rec loopc c =
                      if c < pln.Length then pln.[c] <- pln.[c] ||| msk; loopc (c + bp)
                    loopc (s8 >>> 3); loops (s8 + bp) in loops s0
              else
                let rec loopsi si =
                  if si < szbits then
                    let w = si >>> 3 in pln.[w] <- pln.[w] ||| BITMSK.[si &&& 7]
                    loopsi (si + bp) in loopsi s0
              looppi obpras (pi + 1)
          else match bprastl with
               | None -> ()
               | Some lv -> looppi lv.Value 0        
        looppi bpras 0; loopri (ri + 1) in loopri 0

  /// pre-culled wheel pattern with a 131072 extra size to avoid overflow...
  /// (copy by 16 Kilobytes per time!)
  let private WHLPTRN: SieveBuffer = // rounded up to next 32-bit alignmenet!
    let sb = makeSieveBuffer ((WHLPTRNLEN <<< 3) + 131072 + 31)
    let strtsa = Array.zeroCreate NUMPCULLPRMS
    cullSieveBuffer (primendx 0) PCULLPRMREPS strtsa sb; sb

  /// fill the SieveBuffer from the WHLPTRN according to the modulo of the low wheel index...
  let private fillSieveBuffer (lwi: PrimeNdx) (sb: SieveBuffer) =
    let len = sb.[0].Length in let cpysz = min len 16384 in let mdlo0 = lwi / (primendx 8)
    { 0 .. WHLHITS - 1 } |> Seq.iter (fun i ->
      { 0 .. 16384 .. len - 1 } |> Seq.iter (fun j ->
        let mdlo = (mdlo0 + primendx j) % (primendx WHLPTRNLEN) |> int
        Array.blit WHLPTRN.[i] mdlo sb.[i] j cpysz))

  /// fast value set bit count Look Up Table (CLUT) for 16-bit input...
  let private CLUT: uint8[] =
    let arr = Array.zeroCreate 65536
    let rec cntem i cnt = if i <= 0 then cnt else cntem (i &&& (i - 1)) (cnt + 1)
    for i = 0 to 65535 do arr.[i] <- cntem i 0 |> uint8
    arr

  /// count the zero (prime) bits in the SieveBuffer up to the "lsti" odd index...
  let private countSieveBuffer (bitlmt: int) (sb: SieveBuffer): int =
    let lstwi = bitlmt / WHLODDCRC
    let lstri = WHLNDXS.[bitlmt - lstwi * WHLODDCRC]
    let lst = (lstwi >>> 5) <<< 2 in let lstm = lstwi &&& 31
    let rec loopr ri cr =
      if ri >= WHLHITS then cr else
      let pln = sb.[ri]
      let rec cntem i cnt =
        if i >= lst then
          let msk = (0xFFFFFFFFu <<< lstm) <<< if ri <= lstri then 1 else 0
          let v = (uint32 pln.[lst] + (uint32 pln.[lst + 1] <<< 8) +
                   (uint32 pln.[lst + 2] <<< 16) + (uint32 pln.[lst + 3] <<< 24)) ||| msk
          cnt - int CLUT.[int v &&& 0xFFFF] - int CLUT.[int (v >>> 16)] else
        let v = uint32 pln.[i] + (uint32 pln.[i + 1] <<< 8) +
                (uint32 pln.[i + 2] <<< 16) + (uint32 pln.[i + 3] <<< 24)
        cntem (i + 4) (cnt - int CLUT.[int v &&& 0xFFFF] - int CLUT.[int (v >>> 16)])
      let cnti = cntem 0 cr in loopr (ri + 1) cnti
    loopr 0 ((lst * 8 + 32) * WHLHITS)

  /// it's rediculously easy to make this multi-threaded with the following change...
// (*
  /// a CIS series of pages from the given start index with the given SieveBuffer size,
  /// and provided with a polymorphic converter function to produce
  /// and type of result from the culled page parameters...
  let cNUMPROCS = System.Environment.ProcessorCount
  let rec private makePrimePages strtwi btsz strtsasz
                                 (cnvrtrf: PrimeNdx -> SieveBuffer -> 'T): CIS<'T> =
    let bpas = makeBasePrimes() in let tsks = Array.zeroCreate cNUMPROCS
    let sbs = Array.init cNUMPROCS (fun _ -> Array.zeroCreate (btsz >>> 3))
    let mktsk lwi i = System.Threading.Tasks.Task.Run(fun() ->
      let sb = makeSieveBuffer btsz in let strtsa = Array.zeroCreate strtsasz
      fillSieveBuffer lwi sb; cullSieveBuffer lwi bpas strtsa sb
      cnvrtrf lwi sb)
    let rec jobfeed lwi i =
      CIS(lwi, fun() -> let ni = i + 1
                        jobfeed (lwi + primendx btsz)
                                (if ni >= cNUMPROCS then 0 else ni))
    let rec strttsks (CIS(lwi, jbfdtlf) as jbfd) i =
      if i >= cNUMPROCS then jbfd else
      tsks.[i] <- mktsk lwi i; strttsks (jbfdtlf()) (i + 1)
    let rec mkpgrslt (CIS(lwi, jbfdtlf)) i =
      let rslt = tsks.[i].Result in tsks.[i] <- mktsk lwi i
      CIS(rslt,
          fun() -> mkpgrslt (jbfdtlf())
                            (if i >= cNUMPROCS - 1 then 0 else i + 1))
    mkpgrslt <| strttsks (jobfeed strtwi 0) 0 <| 0
// *)

  // the below is single threaded...
(*
  /// a CIS series of pages from the given start index with the given SieveBuffer size,
  /// and provided with a polymorphic converter function to produce
  /// and type of result from the culled page parameters...
  let rec private makePrimePages strtwi btsz strtsasz
                                 (cnvrtrf: PrimeNdx -> SieveBuffer -> 'T): CIS<'T> =
    let bpas = makeBasePrimes() in let sb = makeSieveBuffer btsz
    let strtsa = Array.zeroCreate strtsasz
    let rec nxtpg lwi =
      fillSieveBuffer lwi sb; cullSieveBuffer lwi bpas strtsa sb
      CIS(cnvrtrf lwi sb, fun() -> nxtpg (lwi + primendx btsz))
    nxtpg strtwi
// *)

  /// secondary feed of lazy list of memoized pages of base primes...
  and private makeBasePrimes(): BasePrimeRepArrs =
    let sb2bpa lwi (sb: SieveBuffer) =
      let btsz = sb.[0].Length <<< 3
      let arr =
        Array.zeroCreate <| countSieveBuffer ((btsz * WHLODDCRC) - 1) sb
      let rec loop ri i j =
        if i < btsz then
          if ri < WHLHITS then
            if sb.[ri].[i >>> 3] &&& BITMSK.[i &&& 7] <> 0uy then
              loop (ri + 1) i j
            else arr.[j] <- ((bprep lwi + bprep i) <<< 6) ||| bprep ri
                 loop (ri + 1) i (j + 1)
          else loop 0 (i + 1) j in loop 0 0 0; arr
    // finding the first page as not part of the loop and making succeeding
    // pages lazy breaks the recursive data race!
    let fksb = makeSieveBuffer 64 in fillSieveBuffer (primendx 0) fksb
    let fkbpra = sb2bpa (primendx 0) fksb
    let fkbpas = BasePrimeRepArrs(fkbpra, None)
    let strtsa = Array.zeroCreate (fkbpra.Length + 1)
    let frstsb = makeSieveBuffer 512 in fillSieveBuffer (primendx 0) frstsb
    cullSieveBuffer (primendx 0) fkbpas strtsa frstsb
    let rec nxtbpas (CIS(bpa, tlf)) =
      BasePrimeRepArrs(bpa, Some(lazy (nxtbpas (tlf()))))
    let restbpras =
      Some(lazy (nxtbpas <|
                   makePrimePages (primendx 512) 512 NUMSTRTSPRMS sb2bpa))
    let frstbpa = sb2bpa (primendx 0) frstsb
    BasePrimeRepArrs(frstbpa, restbpras)                    

  /// produces a generator of primes; uses mutability for better speed...
  let primes(): unit -> Prime =
    let sb2prmsarr lwi (sb: SieveBuffer) =
      let btsz = sb.[0].Length <<< 3
      let arr = Array.zeroCreate <| countSieveBuffer (btsz * WHLODDCRC - 1) sb
      let baseprm = prime (lwi + lwi) * prime WHLODDCRC
      let inline notprm ri i = sb.[ri].[i >>> 3] &&& BITMSK.[i &&& 7] <> 0uy
      let rec loop ri i j =
        if ri >= WHLHITS then loop 0 (i + 1) j else
        if i < btsz then
          if notprm ri i then loop (ri + 1) i j
          else arr.[j] <- baseprm + prime (i + i) * prime WHLODDCRC
                            + prime RESIDUES.[ri]
               loop (ri + 1) i (j + 1) in loop 0 0 0
      arr
    let mutable i = -WHLPRMS.Length
    let (CIS(nprms, npgtlf)) = // use page generator function above!
      makePrimePages (primendx 0) cPGSZBTS NUMSTRTSPRMS sb2prmsarr
    let mutable prmarr = nprms in let mutable pgtlf = npgtlf
    fun() -> 
      if i >= 0 && i < prmarr.Length then
        let oi = i in i <- i + 1; prmarr.[oi] else // ready next call!      
        if i < 0 then i <- i + 1; WHLPRMS.[7 + i] else
        let (CIS(nprms, npgtlf)) = pgtlf() // use page generator function above!
        i <- 1; prmarr <- nprms; pgtlf <- npgtlf; prmarr.[0]

  let countPrimesTo (limit: Prime): int64 = // much faster!
    let precnt = WHLPRMS |> Seq.takeWhile ((>=) limit) |> Seq.length |> int64
    if limit < FRSTSVPRM then precnt else
    let topndx = (limit - FRSTSVPRM) >>> 1 |> primendx
    let lmtlwi = topndx / primendx WHLODDCRC
    let sb2cnt lwi (sb: SieveBuffer) =
      let btsz = sb.[0].Length <<< 3 in let lmti = lwi + primendx (btsz - 1)
      countSieveBuffer
        (if lmti < lmtlwi then btsz * WHLODDCRC - 1
         else int (topndx - lwi * primendx WHLODDCRC)) sb |> int64, lmti
    let rec loop (CIS((cnt, nxti), tlf)) count =
      if nxti < lmtlwi then loop (tlf()) (count + cnt)
      else count + cnt
    loop <| makePrimePages (primendx 0) cPGSZBTS NUMSTRTSBASEPRMS sb2cnt <| precnt

open System
open SoE

[<EntryPoint>]
let main argv =
  let limit = prime 2000000000

  let frstprms = primes()
  printf "The first 23 primes are:  "
  for _ in 1 .. 25 do printf "%d " (frstprms())
  printfn ""

  let numprms = primes() in let mutable cnt = 0
  printf "Number of primes up to a million:  "
  while numprms() <= prime 1000000 do cnt <- cnt + 1
  printfn "%d" cnt

  let strt = DateTime.Now.Ticks

(*  // the slow way of enumerating and counting...
  let primegen = primes() in let mutable answr = 0
  while primegen() <= limit do answr <- answr + 1
// *)

  // the fast way of counting...
  let answr = countPrimesTo (prime 2000000000)

  let elpsd = (DateTime.Now.Ticks - strt) / 10000L

  printfn "Found %d primes up to %d in %d milliseconds" answr limit elpsd

  0 // return an integer exit code

在具有两个内核/四个线程的3.1 GHz旧Intel I3-2100上运行的输出是:

The first 23 primes are:  2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97
Number of primes up to a million:  78498
Found 98222287 primes to 2000000000 in 468 milliseconds

每个剔除操作大约需要5.8个CPU时钟周期(此范围内有十亿次剔除操作)。如果有更多的实际(非超线程)线程,更高的CPU时钟速率以及更新的“每时钟指令数”(IPC),则新一代的CPU将会成比例地提高速度。

这是关于达到此范围的DotNet代码的极限速度,但是对于超过170亿的更大范围,将剔除缓冲区大小调整为与要筛分的最大数字的平方根成比例的进一步改进将如果整个范围都被筛分了,而不仅仅是整个范围的较窄范围,则可以帮助保持速度,因为范围增加到需要几天,几周,几个月才能完成的巨大范围。

答案 15 :(得分:0)

对此线程进行了一些真正令人着迷和启发性的讨论,我知道该线程非常老,但是我想解决OP的原始问题。回想一下,它想要一个纯功能版本的Eratosthenes的Sieve。

@csrf

这具有已经讨论的缺陷。当然,最简单的纯函数式解决方案不使用突变,模运算(需要进行过多检查以剔除候选字符)会是这样吗?

let rec PseudoSieve list =
match list with
| hd::tl -> hd :: (PseudoSieve <| List.filter (fun x -> x % hd <> 0) tl)
| [] -> []

这显然不是为了提高最终性能和内存使用率,而检查let rec sieve primes = function | [] -> primes |> List.rev | p :: rest -> sieve (p :: primes) (rest |> List.except [p*p..p..n]) 的方式会很有趣-这样做的方式是仅执行一次,(这可能使它成为而不是实施Eratosthenes的Sieve,但是与天真的解决方案相比,它具有与OP中链接的论文中所述的相同的优点)-已实施,并且Big O的成本在那里。

我仍然认为这是对原始OP的最简洁答案。你觉得呢?

更新:使用List.p中的p * p使其成为合适的筛子!