Question

我想知道是否有人在这里有一个很好的实施他们想要分享的阿特金筛选。

我正在尝试实现它，但不能完全包围它。这是我到目前为止所拥有的。

public class Atkin : IEnumerable<ulong>
{
    private readonly List<ulong> primes;
    private readonly ulong limit;

    public Atkin(ulong limit)
    {
        this.limit = limit;
        primes = new List<ulong>();
    }

    private void FindPrimes()
    {
        var isPrime = new bool[limit + 1];
        var sqrt = Math.Sqrt(limit);

        for (ulong x = 1; x <= sqrt; x++)
            for (ulong y = 1; y <= sqrt; y++)
            {
                var n = 4*x*x + y*y;
                if (n <= limit && (n % 12 == 1 || n % 12 == 5))
                    isPrime[n] ^= true;

                n = 3*x*x + y*y;
                if (n <= limit && n % 12 == 7)
                    isPrime[n] ^= true;

                n = 3*x*x - y*y;
                if (x > y && n <= limit && n % 12 == 11)
                    isPrime[n] ^= true;
            }

        for (ulong n = 5; n <= sqrt; n++)
            if (isPrime[n])
                for (ulong k = n*n; k <= limit; k *= k)
                    isPrime[k] = false;

        primes.Add(2);
        primes.Add(3);
        for (ulong n = 5; n <= limit; n++)
            if (isPrime[n])
                primes.Add(n);
    }


    public IEnumerator<ulong> GetEnumerator()
    {
        if (!primes.Any())
            FindPrimes();


        foreach (var p in primes)
            yield return p;
    }


    IEnumerator IEnumerable.GetEnumerator()
    {
        return GetEnumerator();
    }
}

我几乎只是试图“翻译”Wikipedia中列出的伪代码，但它无法正常工作。所以要么我误解了某些东西，要么只是做错了什么。或者很可能两者......

列出我用作测试的前500个素数，我的实现在40号（或41？）处失败。

值在索引[40]上不同     预期：179
    但是：175

你是否能够找到我的错误，你是否有可以分享的实施，或两者兼而有之？

我使用的确切测试如下：

public abstract class AtkinTests
{
    [Test]
    public void GetEnumerator_FirstFiveHundredNumbers_AreCorrect()
    {
        var sequence = new Atkin(2000000);
        var actual = sequence.Take(500).ToArray();
        var expected = First500;

        CollectionAssert.AreEqual(expected, actual);
    }

    private static readonly ulong[] First500 = new ulong[]
        {
            2, 3, 5, 7, 11, 13, 17, ...
        };
}

Answer 1

此代码：

for (ulong k = n*n; k <= limit; k *= k)
  isPrime[k] = false;

似乎不是这个伪代码的忠实翻译：

is_prime(k) ← false, k ∈ {n², 2n², 3n², ..., limit}

你的代码看起来会运行n * n，n ^ 4，n ^ 8等，即每次平方而不是每次加n次方。试试这个：

ulong nSquared = n * n;
for (ulong k = nSquared; k <= limit; k += nSquared)
  isPrime[k] = false;

Answer 2

来自Atved Sieve（SoA）的翻译Python源代码的last answer by Aaron Mugatroyd并不是太糟糕，但它可以在几个方面得到改进，因为它错过了一些重要的优化，如下所示： / p>

他的回答并没有使用完整的modulo 60原版Atkin和Bernstein版本的Sieve，而是稍微改进了维基百科文章中伪代码的变体，因此使用的数值约为0.36倍筛范围组合切换/剔除操作;我的代码使用合理有效的非页面段伪代码，按照my comments in an answer commenting on the Sieve of Atkin使用大约0.26倍数值范围的因子，将工作量减少到大约三分之二左右。
他的代码通过仅具有奇数表示来减少缓冲区大小，而我的代码进一步比特包以消除可被3和5整除的数字的任何表示以及可由＆＃34;赔率仅＆＃34 ;;这将内存需求减少了近一半（至8/15）并有助于更好地利用CPU缓存，以便通过减少平均内存访问时间来进一步提高速度。
我的代码使用快速查找表（LUT）弹出计数技术计算素数的数量，与使用他使用的逐位技术的大约一秒相比，几乎没有时间计数;但是，在此示例代码中，即使从代码的定时部分取出的时间很短。
最后，我的代码优化了位操作操作，每个内部循环的代码最少。例如，它不使用连续的右移一个来产生奇数表示索引，实际上通过将所有内部循环写为常数模（等位位置）操作来实现小位移。同样，Aaron的翻译代码在操作中是非常低效的，例如在素数正方形自由剔除中它将素数的平方加到索引然后检查奇数结果而不是仅仅加上正方形的两倍而不是要求检查;然后，在内循环中执行剔除操作之前，通过将数字右移1（除以2）使得检查更加冗余，就像对所有循环一样。这个效率低下的代码使用这个大型筛分缓冲阵列不会对大范围的执行时间造成太大影响。技术，因为每次操作的大部分时间都用于RAM存储器访问（大约37个CPU时钟周期或更多，范围为10亿），但是会使执行时间比适合的较小范围要慢得多。 CPU缓存;换句话说，它为每次操作的执行速度设置了太高的最低限制。

代码如下：

//Sieve of Atkin based on full non page segmented modulo 60 implementation...

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;

namespace NonPagedSoA {
  //implements the non-paged Sieve of Atkin (full modulo 60 version)...
  class SoA : IEnumerable<ulong> {
    private ushort[] buf = null;
    private long cnt = 0;
    private long opcnt = 0;
    private static byte[] modPRMS = { 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61 };
    private static ushort[] modLUT;
    private static byte[] cntLUT;
    //initialize the private LUT's...
    static SoA() {
      modLUT = new ushort[60];
      for (int i = 0, m = 0; i < modLUT.Length; ++i) {
        if ((i & 1) != 0 || (i + 7) % 3 == 0 || (i + 7) % 5 == 0) modLUT[i] = 0;
        else modLUT[i] = (ushort)(1 << (m++));
      }
      cntLUT = new byte[65536];
      for (int i = 0; i < cntLUT.Length; ++i) {
        var c = 0;
        for (int j = i; j > 0; j >>= 1) c += j & 1;
        cntLUT[i] = (byte)c;
      }
    }
    //initialization and all the work producing the prime bit array done in the constructor...
    public SoA(ulong range) {
      this.opcnt = 0;
      if (range < 7) {
        if (range > 1) {
          cnt = 1;
          if (range > 2) this.cnt += (long)(range - 1) / 2;
        }
        this.buf = new ushort[0];
      }
      else {
        this.cnt = 3;
        var nrng = range - 7; var lmtw = nrng / 60;
        //initialize sufficient wheels to non-prime
        this.buf = new ushort[lmtw + 1];

        //Put in candidate primes:
        //for the 4 * x ^ 2 + y ^ 2 quadratic solution toggles - all x odd y...
        ulong n = 6; // equivalent to 13 - 7 = 6...
        for (uint x = 1, y = 3; n <= nrng; n += (x << 3) + 4, ++x, y = 1) {
          var cb = n; if (x <= 1) n -= 8; //cancel the effect of skipping the first one...
          for (uint i = 0; i < 15 && cb <= range; cb += (y << 2) + 4, y += 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0)
              for (uint c = (uint)cbd, my = y + 15; c < buf.Length; c += my, my += 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
          }
        }
        //for the 3 * x ^ 2 + y ^ 2 quadratic solution toggles - x odd y even...
        n = 0; // equivalent to 7 - 7 = 0...
        for (uint x = 1, y = 2; n <= nrng; n += ((x + x + x) << 2) + 12, x += 2, y = 2) {
          var cb = n;
          for (var i = 0; i < 15 && cb <= range; cb += (y << 2) + 4, y += 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0)
              for (uint c = (uint)cbd, my = y + 15; c < buf.Length; c += my, my += 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
          }
        }
        //for the 3 * x ^ 2 - y ^ 2 quadratic solution toggles all x and opposite y = x - 1...
        n = 4; // equivalent to 11 - 7 = 4...
        for (uint x = 2, y = x - 1; n <= nrng; n += (ulong)(x << 2) + 4, y = x, ++x) {
          var cb = n; int i = 0;
          for ( ; y > 1 && i < 15 && cb <= nrng; cb += (ulong)(y << 2) - 4, y -= 2, ++i) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if (cm != 0) {
              uint c = (uint)cbd, my = y;
              for ( ; my >= 30 && c < buf.Length; c += my - 15, my -= 30) {
                buf[c] ^= cm; // ++this.opcnt;
              }
              if (my > 0 && c < buf.Length) { buf[c] ^= cm; /* ++this.opcnt; */ }
            }
          }
          if (y == 1 && i < 15) {
            var cbd = cb / 60; var cm = modLUT[cb % 60];
            if ((cm & 0x4822) != 0 && cbd < (ulong)buf.Length) { buf[cbd] ^= cm; /* ++this.opcnt; */ }
          }
        }

        //Eliminate squares of base primes, only for those on the wheel:
        for (uint i = 0, w = 0, pd = 0, pn = 0, msk = 1; w < this.buf.Length ; ++i) {
          uint p = pd + modPRMS[pn];
          ulong sqr = (ulong)p * (ulong)p; //to handle ranges above UInt32.MaxValue
          if (sqr > range) break;
          if ((this.buf[w] & msk) != 0) { //found base prime, square free it...
            ulong s = sqr - 7;
            for (int j = 0; s <= nrng && j < modPRMS.Length; s = sqr * modPRMS[j] - 7, ++j) {
              var cd = s / 60; var cm = (ushort)(modLUT[s % 60] ^ 0xFFFF);
              //may need ulong loop index for ranges larger than two billion
              //but buf length only good to about 2^31 * 60 = 120 million anyway,
              //even with large array setting and half that with 32-bit...
              for (ulong c = cd; c < (ulong)this.buf.Length; c += sqr) {
                this.buf[c] &= cm; // ++this.opcnt;
              }
            }
          }
          if (msk >= 0x8000) { msk = 1; pn = 0; ++w; pd += 60; }
          else { msk <<= 1; ++pn; }
        }

        //clear any overflow primes in the excess space in the last wheel/word:
        var ndx = nrng % 60; //clear any primes beyond the range
        for (; modLUT[ndx] == 0; --ndx) ;
        this.buf[lmtw] &= (ushort)((modLUT[ndx] << 1) - 1);
      }
    }

    //uses a fast pop count Look Up Table to return the total number of primes...
    public long Count {
      get {
        long cnt = this.cnt;
        for (int i = 0; i < this.buf.Length; ++i) cnt += cntLUT[this.buf[i]];
        return cnt;
      }
    }

    //returns the number of toggle/cull operations used to sieve the prime bit array...
    public long Ops {
      get {
        return this.opcnt;
      }
    }

    //generate the enumeration of primes...
    public IEnumerator<ulong> GetEnumerator() {
      yield return 2; yield return 3; yield return 5;
      ulong pd = 0;
      for (uint i = 0, w = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) {
        if ((this.buf[w] & msk) != 0) //found a prime bit...
          yield return pd + modPRMS[pn]; //add it to the list
        if (msk >= 0x8000) { msk = 1; pn = 0; ++w; pd += 60; }
        else { msk <<= 1; ++pn; }
      }
    }

    //required for the above enumeration...
    IEnumerator IEnumerable.GetEnumerator() {
      return this.GetEnumerator();
    }
  }

  class Program {
    static void Main(string[] args) {
      Console.WriteLine("This program calculates primes by a simple full version of the Sieve of Atkin.\r\n");

      const ulong n = 1000000000;

      var elpsd = -DateTime.Now.Ticks;

      var gen = new SoA(n);

      elpsd += DateTime.Now.Ticks;

      Console.WriteLine("{0} primes found to {1} using {2} operations in {3} milliseconds.", gen.Count, n, gen.Ops, elpsd / 10000);

      //Output prime list for testing...
      //Console.WriteLine();
      //foreach (var p in gen) {
      //  Console.Write(p + " ");
      //}
      //Console.WriteLine();

//Test options showing what one can do with the enumeration, although more slowly...
//      Console.WriteLine("\r\nThere are {0} primes with the last one {1} and the sum {2}.",gen.Count(),gen.Last(),gen.Sum(x => (long)x));

      Console.Write("\r\nPress any key to exit:");
      Console.ReadKey(true);
      Console.WriteLine();
    }
  }
}

此代码的运行速度约为Aaron代码的两倍（在i7-2700K（3.5 GHz）上使用64位或32位模式约2.7秒，缓冲区约为16.5兆字节，总计约为25.58亿切换/素数正方形自由剔除操作（可通过取消注释＆＃34; ++ this.opcnt＆＃34;语句显示）筛选范围为10亿，而5.4 / 6.2秒（32位/ 64） -bit）他的代码没有计数时间，几乎是内存使用量的两倍，使用大约0.359亿次组合切换/剔除操作来筛选多达10亿次。

虽然它比他最优化的天真赔率 - 仅实施非分页的Eratosthenes筛选（SoE）更快，不会使Atkin的筛子比Eratosthenes的Sieve更快，如果将上述SoA实现中使用的类似技术应用于SoE plus并使用最大轮分解，则SoE将与此速度大致相同。

分析：虽然完全优化的SoE的操作次数与筛选范围为10亿的SoA的操作次数大致相同，但这些非分页的主要瓶颈一旦筛选缓冲区大小超过CPU缓存大小（在一个时钟周期访问时为32千字节L1缓存，在大约四个时钟周期访问时间为256千字节L2缓存，在大约20个时钟周期访问时间为8兆字节三级缓存），实现就是内存访问i7），之后存储器访问可以超过一百个时钟周期。

现在，当人们将算法调整到页面分割时，两者都有大约8倍的内存访问速度提升因子，因此可以筛选出无法满足可用内存的范围。然而，随着筛选范围开始变得非常大，SoE继续超过SoA，因为难以实施＆＃34; primes square free＆＃34;算法的一部分是由于剔除扫描的巨大进步，迅速增长到页面缓冲区大小的数百倍。同样，也许更严重的是，它为记忆和/或计算密集，以计算每个值的新起始点＆＃39; x＆＃39;关于＆＃39; y＆＃39;的价值在每个页面缓冲区的最低表示中，随着范围的增长，与SoE相比，分页SoA的效率进一步大幅下降。

EDIT_ADD： Aaron Murgatroyd使用的仅有几率的SoE使用了大约10.26亿次剔除操作，筛选范围为10亿，所以操作次数是SoA的四倍，因此应该运行慢了四倍，但是这里实施的SoA有一个更复杂的内循环，特别是由于更高比例的赔率 - 只有SoE剔除在剔除扫描中的步幅比SoA的步幅短得多的天真几率 - 尽管筛选缓冲区大大超过CPU缓存大小（更好地利用缓存关联性），但SoE具有更好的平均内存访问时间。这就解释了为什么上述SoA的速度只有仅有几率的SoA的两倍，尽管它理论上似乎只能完成四分之一的工作。

如果使用类似的算法使用常数模内圈和上述SoA并实现相同的2/3/5轮分解，那么SoE会将剔除操作的数量减少到大约0.405亿次操作，所以比SoA多50％的操作，理论上运行速度比SoA慢一点，但可能会以大约相同的速度运行，因为对于这个＆＃34;天真的＆＃34; naive＆的平均步幅仍然比SoA略小一些＃34;大内存缓冲区使用。将车轮分解增加到2/3/5/7车轮意味着对于10亿的剔除范围，SoE剔除操作减少到大约0.314，并且可能使该版本的SoE以相同的速度运行该算法。

可以通过预先剔除2/3/5/7/11/13/17/19素数因子的筛子阵列（模式复制）来进一步使用轮子因子分解，执行时间几乎不需要任何费用对于10亿的筛选范围，将剔除操作的总数减少到大约2.51亿，并且即使对于这些大型内存缓冲版本，SoE也会比SoA的运行速度更快或速度更快，即使对于这些大型内存缓冲版本也是如此代码复杂度仍远低于上述代码。

因此，可以看出，SoE的操作次数可以从天真或甚至仅赔率或2/3/5车轮分解版本大大减少，使得操作次数与同时，由于内部循环不太复杂和内存访问效率更高，每个操作的时间实际上可能更少。的 END_EDIT_ADD

EDIT_ADD2：我在这里根据伪代码further down the answer as linked above使用类似的常数模/位位置技术为最内层循环添加SoE的代码和上面的SoA。尽管应用了高轮分解和预剔除，但是剔除操作的总数实际上小于SoA的组合切换/剔除操作直到筛分范围，因此代码比上述SoA复杂得多。大约20亿。代码如下：

EDIT_FINAL 更正了以下代码以及与之相关的评论 END_EDIT_FINAL

//Sieve of Eratosthenes based on maximum wheel factorization and pre-culling implementation... using System; using System.Collections; using System.Collections.Generic; namespace NonPagedSoE { //implements the non-paged Sieve of Eratosthenes (full modulo 210 version with preculling)... class SoE : IEnumerable<ulong> { private ushort[] buf = null; private long cnt = 0; private long opcnt = 0; private static byte[] basePRMS = { 2, 3, 5, 7, 11, 13, 17, 19 }; private static byte[] modPRMS = { 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, //positions + 23 97, 101, 103, 107, 109, 113, 121, 127, 131, 137, 139, 143, 149, 151, 157, 163, 167, 169, 173, 179, 181 ,187 ,191 ,193, 197, 199, 209, 211, 221, 223, 227, 229 }; private static byte[] gapsPRMS = { 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6, 4, 6, 8, 4, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4, 2, 10, 2, 10, 2, 4, 2, 4 }; private static ulong[] modLUT; private static byte[] cntLUT; //initialize the private LUT's... static SoE() { modLUT = new ulong[210]; for (int i = 0, m = 0; i < modLUT.Length; ++i) { if ((i & 1) != 0 || (i + 23) % 3 == 0 || (i + 23) % 5 == 0 || (i + 23) % 7 == 0) modLUT[i] = 0; else modLUT[i] = 1UL << (m++); } cntLUT = new byte[65536]; for (int i = 0; i < cntLUT.Length; ++i) { var c = 0; for (int j = i ^ 0xFFFF; j > 0; j >>= 1) c += j & 1; //reverse logic; 0 is prime; 1 is composite cntLUT[i] = (byte)c; } } //initialization and all the work producing the prime bit array done in the constructor... public SoE(ulong range) { this.opcnt = 0; if (range < 23) { if (range > 1) { for (int i = 0; i < modPRMS.Length; ++i) if (modPRMS[i] <= range) this.cnt++; else break; } this.buf = new ushort[0]; } else { this.cnt = 8; var nrng = range - 23; var lmtw = nrng / 210; var lmtwt3 = lmtw * 3; //initialize sufficient wheels to prime this.buf = new ushort[lmtwt3 + 3]; //initial state of all zero's is all potential prime. //initialize array to account for preculling the primes of 11, 13, 17, and 19; //(2, 3, 5, and 7 already eliminated by the bit packing to residues). for (int pn = modPRMS.Length - 4; pn < modPRMS.Length; ++pn) { uint p = modPRMS[pn] - 210u; ulong pt3 = p * 3; ulong s = p * p - 23; ulong xrng = Math.Min(9699709, nrng); // only do for the repeating master pattern size ulong nwrds = (ulong)Math.Min(138567, this.buf.Length); for (int j = 0; s <= xrng && j < modPRMS.Length; s += p * gapsPRMS[(pn + j++) % 48]) { var sm = modLUT[s % 210]; var si = (sm < (1UL << 16)) ? 0UL : ((sm < (1UL << 32)) ? 1UL : 2UL); var cd = s / 210 * 3 + si; var cm = (ushort)(sm >> (int)(si << 4)); for (ulong c = cd; c < nwrds; c += pt3) { //tight culling loop for size of master pattern this.buf[c] |= cm; // ++this.opcnt; //reverse logic; mark composites with ones. } } } //Now copy the master pattern so it repeats across the main buffer, allow for overflow... for (long i = 138567; i < this.buf.Length; i += 138567) if (i + 138567 <= this.buf.Length) Array.Copy(this.buf, 0, this.buf, i, 138567); else Array.Copy(this.buf, 0, this.buf, i, this.buf.Length - i); //Eliminate all composites which are factors of base primes, only for those on the wheel: for (uint i = 0, w = 0, wi = 0, pd = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) { uint p = pd + modPRMS[pn]; ulong sqr = (ulong)p * (ulong)p; if (sqr > range) break; if ((this.buf[w] & msk) == 0) { //found base prime, mark its composites... ulong s = sqr - 23; ulong pt3 = p * 3; for (int j = 0; s <= nrng && j < modPRMS.Length; s += p * gapsPRMS[(pn + j++) % 48]) { var sm = modLUT[s % 210]; var si = (sm < (1UL << 16)) ? 0UL : ((sm < (1UL << 32)) ? 1UL : 2UL); var cd = s / 210 * 3 + si; var cm = (ushort)(sm >> (int)(si << 4)); for (ulong c = cd; c < (ulong)this.buf.Length; c += pt3) { //tight culling loop this.buf[c] |= cm; // ++this.opcnt; //reverse logic; mark composites with ones. } } } ++pn; if (msk >= 0x8000) { msk = 1; ++w; ++wi; if (wi == 3) { wi = 0; pn = 0; pd += 210; } } else msk <<= 1; } //clear any overflow primes in the excess space in the last wheel/word: var ndx = nrng % 210; //clear any primes beyond the range for (; modLUT[ndx] == 0; --ndx) ; var cmsk = (~(modLUT[ndx] - 1)) << 1; //force all bits above to be composite ones. this.buf[lmtwt3++] |= (ushort)cmsk; this.buf[lmtwt3++] |= (ushort)(cmsk >> 16); this.buf[lmtwt3] |= (ushort)(cmsk >> 32); } } //uses a fast pop count Look Up Table to return the total number of primes... public long Count { get { long cnt = this.cnt; for (int i = 0; i < this.buf.Length; ++i) cnt += cntLUT[this.buf[i]]; return cnt; } } //returns the number of cull operations used to sieve the prime bit array... public long Ops { get { return this.opcnt; } } //generate the enumeration of primes... public IEnumerator<ulong> GetEnumerator() { yield return 2; yield return 3; yield return 5; yield return 7; yield return 11; yield return 13; yield return 17; yield return 19; ulong pd = 0; for (uint i = 0, w = 0, wi = 0, pn = 0, msk = 1; w < this.buf.Length; ++i) { if ((this.buf[w] & msk) == 0) //found a prime bit... yield return pd + modPRMS[pn]; ++pn; if (msk >= 0x8000) { msk = 1; ++w; ++wi; if (wi == 3) { wi = 0; pn = 0; pd += 210; } } else msk <<= 1; } } //required for the above enumeration... IEnumerator IEnumerable.GetEnumerator() { return this.GetEnumerator(); } } class Program { static void Main(string[] args) { Console.WriteLine("This program calculates primes by a simple maximually wheel factorized version of the Sieve of Eratosthenes.\r\n"); const ulong n = 1000000000; var elpsd = -DateTime.Now.Ticks; var gen = new SoE(n); elpsd += DateTime.Now.Ticks; Console.WriteLine("{0} primes found to {1} using {2} operations in {3} milliseconds.", gen.Count, n, gen.Ops, elpsd / 10000); // Console.WriteLine(); // foreach (var p in gen) { // Console.Write(p + " "); // } // Console.WriteLine(); // Console.WriteLine("\r\nThere are {0} primes with the last one {1} and the sum {2}.",gen.Count(),gen.Last(),gen.Sum(x => (long)x)); Console.Write("\r\nPress any key to exit:"); Console.ReadKey(true); Console.WriteLine(); } } }

这段代码的实际运行速度比上面的SoA快了几个百分点，因为操作稍微少一点，这个大型阵列大小的主要瓶颈是十亿分之一的内存访问时间，如40到100以上CPU时钟周期取决于CPU和内存规格;这意味着代码优化（除了减少操作总数之外）是无效的，因为大部分时间花在等待内存访问上。无论如何，使用巨大的内存缓冲区并不是筛选大范围的最有效方法，使用具有相同最大轮分解的页面分割的SoE最多可提高8倍（这也是铺平了多处理方式）。

在实现页面分割和多处理时，与SoE相比，SoA实际上缺少超过40亿的范围，因为由于SoA的渐近复杂度降低而导致的任何增益都会被页面处理开销迅速消耗掉。与素数广场自由处理和计算更大数量的页面起始地址相关的因素;或者，人们通过将标记存储在RAM存储器中以大量的存储器消耗和访问这些标记存储结构的低效率来克服这一点。的 END_EDIT_ADD2

简而言之，与完全车轮分解的SoE相比，SoA并不是一个真正的实用筛子，因为正如渐近复杂度的增益开始使其在性能上接近完全优化的SoE，它开始由于相对存储器访问时间和页面分割复杂性的实际实现细节以及通常更复杂和难以编写而导致效率降低。在我看来，与SoE相比，它更像是一个有趣的智力概念和心理练习，而不是一个实用的筛子。

有一天，我会将这些技术应用于Eratosthenes的多线程页面分段筛，以及与Atkin和Bernstein＆＃34; primegen＆＃34;在＆＃39; C＆＃39;中实施SoA并且它将从水中吹出大约40亿甚至单线程的大范围，当我的i7（包括超线程的八个核心）上的多线程时，速度最多可提高到大约四个。

Answer 3

Here's另一个实现。它使用BitArray来节省内存。 Parallel.For需要.NET Framework 4。

static List<int> FindPrimesBySieveOfAtkins(int max)
{
//  var isPrime = new BitArray((int)max+1, false); 
//  Can't use BitArray because of threading issues.
    var isPrime = new bool[max + 1];
    var sqrt = (int)Math.Sqrt(max);

    Parallel.For(1, sqrt, x =>
    {
        var xx = x * x;
        for (int y = 1; y <= sqrt; y++)
        {
            var yy = y * y;
            var n = 4 * xx + yy;
            if (n <= max && (n % 12 == 1 || n % 12 == 5))
                isPrime[n] ^= true;

            n = 3 * xx + yy;
            if (n <= max && n % 12 == 7)
                isPrime[n] ^= true;

            n = 3 * xx - yy;
            if (x > y && n <= max && n % 12 == 11)
                isPrime[n] ^= true;
        }
    });

    var primes = new List<int>() { 2, 3 };
    for (int n = 5; n <= sqrt; n++)
    {
        if (isPrime[n])
        {
            primes.Add(n);
            int nn = n * n;
            for (int k = nn; k <= max; k += nn)
                isPrime[k] = false;
        }
    }

    for (int n = sqrt + 1; n <= max; n++)
        if (isPrime[n])
            primes.Add(n);

    return primes;
}

Answer 4

以下是Atkin Sieve的更快实现，我在这里从这个Python脚本中窃取了算法（我不相信算法）：

http://programmingpraxis.com/2010/02/19/sieve-of-atkin-improved/

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.Int32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Atkin : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// The number of primes calculated or null if not calculated yet.
        /// </summary>
        protected PrimeType? mpCount = null;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Atkin generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Atkin(PrimeType limit)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mpLimit = limit;

            FindPrimes();
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            PrimeType lpYLimit, lpN, lpXX3, lpXX4, lpDXX, lpDN, lpDXX4, lpXX, lpX, lpYY, lpMinY, lpS, lpK;

            fixed (BitArrayType* lpbOddPrime = &mbaOddPrime[0])
            {
                // n = 3x^2 + y^2 section
                lpXX3 = 3;
                for (lpDXX = 0; lpDXX < 12 * SQRT((mpLimit - 1) / 3); lpDXX += 24)
                {
                    lpXX3 += lpDXX;
                    lpYLimit = (12 * SQRT(mpLimit - lpXX3)) - 36;
                    lpN = lpXX3 + 16;

                    for (lpDN = -12; lpDN < lpYLimit + 1; lpDN += 72)
                    {
                        lpN += lpDN;
                        lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                            (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }

                    lpN = lpXX3 + 4;
                    for (lpDN = 12; lpDN < lpYLimit + 1; lpDN += 72)
                    {
                        lpN += lpDN;
                        lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                            (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }
                }

                //    # n = 4x^2 + y^2 section
                lpXX4 = 0;
                for (lpDXX4 = 4; lpDXX4 < 8 * SQRT((mpLimit - 1) / 4) + 4; lpDXX4 += 8)
                {
                    lpXX4 += lpDXX4;
                    lpN = lpXX4 + 1;

                    if ((lpXX4 % 3) != 0)
                    {
                        for (lpDN = 0; lpDN < (4 * SQRT(mpLimit - lpXX4)) - 3; lpDN += 8)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }
                    }
                    else
                    {
                        lpYLimit = (12 * SQRT(mpLimit - lpXX4)) - 36;
                        lpN = lpXX4 + 25;

                        for (lpDN = -24; lpDN < lpYLimit + 1; lpDN += 72)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }

                        lpN = lpXX4 + 1;
                        for (lpDN = 24; lpDN < lpYLimit + 1; lpDN += 72)
                        {
                            lpN += lpDN;
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                        }
                    }
                }

                //    # n = 3x^2 - y^2 section
                lpXX = 1;
                for (lpX = 3; lpX < SQRT(mpLimit / 2) + 1; lpX += 2)
                {
                    lpXX += 4 * lpX - 4;
                    lpN = 3 * lpXX;

                    if (lpN > mpLimit)
                    {
                        lpMinY = ((SQRT(lpN - mpLimit) >> 2) << 2);
                        lpYY = lpMinY * lpMinY;
                        lpN -= lpYY;
                        lpS = 4 * lpMinY + 4;
                    }
                    else
                        lpS = 4;

                    for (lpDN = lpS; lpDN < 4 * lpX; lpDN += 8)
                    {
                        lpN -= lpDN;
                        if (lpN <= mpLimit && lpN % 12 == 11)
                            lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock));
                    }
                }

                // xx = 0
                lpXX = 0;
                for (lpX = 2; lpX < SQRT(mpLimit / 2) + 1; lpX += 2)
                {
                    lpXX += 4*lpX - 4;
                    lpN = 3*lpXX;

                    if (lpN > mpLimit)
                    {
                        lpMinY = ((SQRT(lpN - mpLimit) >> 2) << 2) - 1;
                        lpYY = lpMinY * lpMinY;
                        lpN -= lpYY;
                        lpS = 4*lpMinY + 4;
                    }
                    else
                    {
                        lpN -= 1;
                        lpS = 0;
                    }

                    for (lpDN = lpS; lpDN < 4 * lpX; lpDN += 8)
                    {
                        lpN -= lpDN;
                        if (lpN <= mpLimit && lpN % 12 == 11)
                            lpbOddPrime[(lpN>>1) / cbBitsPerBlock] ^= 
                                (BitArrayType)((BitArrayType)1 << (int)((lpN>>1) % cbBitsPerBlock));
                    }
                }

                // # eliminate squares
                for (lpN = 5; lpN < SQRT(mpLimit) + 1; lpN += 2)
                    if ((lpbOddPrime[(lpN >> 1) / cbBitsPerBlock] & ((BitArrayType)1 << (int)((lpN >> 1) % cbBitsPerBlock))) != 0)
                        for (lpK = lpN * lpN; lpK < mpLimit; lpK += lpN * lpN)
                            if ((lpK & 1) == 1)
                                lpbOddPrime[(lpK >> 1) / cbBitsPerBlock] &=
                                    (BitArrayType)~((BitArrayType)1 << (int)((lpK >> 1) % cbBitsPerBlock));
            }
        }

        /// <summary>
        /// Calculates the truncated square root for a number.
        /// </summary>
        /// <param name="value">The value to get the square root for.</param>
        /// <returns>The truncated sqrt of the value.</returns>
        private unsafe PrimeType SQRT(PrimeType value)
        {
            return (PrimeType)Math.Sqrt(value);
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            if ((index & 1) == 1)
                return (bits[(index >> 1) / cbBitsPerBlock] & ((BitArrayType)1 << (int)((index >> 1) % cbBitsPerBlock))) != 0;
            else
                return false;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                if (!mpCount.HasValue)
                {
                    PrimeType lpCount = 0;
                    foreach (PrimeType liPrime in this) lpCount++;
                    mpCount = lpCount;
                }

                return mpCount.Value;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            //    return [2,3] + filter(primes.__getitem__, xrange(5,limit,2))

            // Two & Three always prime.
            yield return 2;
            yield return 3;

            // Start at first block, third MSB (5).
            int liBlock = 0;
            byte lbBit = 2;
            BitArrayType lbaCurrent = mbaOddPrime[0] >> lbBit;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 5; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 1)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value. 
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    lbBit = 0;
                    lbaCurrent = mbaOddPrime[++liBlock];

                    //// Move to first bit of next block skipping full blocks.
                    while (lbaCurrent == 0)
                    {
                        lpN += ((PrimeType)cbBitsPerBlock) << 1;
                        if (lpN <= mpLimit)
                            lbaCurrent = mbaOddPrime[++liBlock];
                        else
                            break;
                    }
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns></returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

它与我最优化的Eratosthenes筛选版本的速度接近，但它仍然慢了大约20％，可以在这里找到：

https://stackoverflow.com/a/9700790/738380

Answer 5

Heres mine，它使用一个名为CompartmentalisedParallel的类，它允许您执行并行for循环但控制线程数以便将索引分组。但是，由于线程问题，您需要在每次更改时锁定BitArray或为每个线程创建单独的BitArray，然后在最后将它们一起进行异或，第一个选项非常慢，因为锁的数量，第二个选项似乎对我来说更快！

using System;
using System.Collections;
using System.Collections.Generic;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    public class Atkin : Primes
    {
        protected BitArray mbaPrimes;
        protected bool mbThreaded = true;

        public Atkin(int limit)
            : this(limit, true)
        {
        }

        public Atkin(int limit, bool threaded)
            : base(limit)
        {
            mbThreaded = threaded;
            if (mbaPrimes == null) FindPrimes();
        }

        public bool Threaded
        {
            get
            {
                return mbThreaded;
            }
        }

        public override IEnumerator<int> GetEnumerator()
        {
            yield return 2;
            yield return 3;
            for (int lsN = 5; lsN <= msLimit; lsN += 2)
                if (mbaPrimes[lsN]) yield return lsN;
        }

        private void FindPrimes()
        {
            mbaPrimes = new BitArray(msLimit + 1, false);

            int lsSQRT = (int)Math.Sqrt(msLimit);

            int[] lsSquares = new int[lsSQRT + 1];
            for (int lsN = 0; lsN <= lsSQRT; lsN++)
                lsSquares[lsN] = lsN * lsN;

            if (Threaded)
            {
                CompartmentalisedParallel.For<BitArray>(
                    1, lsSQRT + 1, new ParallelOptions(),
                    (start, finish) => { return new BitArray(msLimit + 1, false); },
                    (lsX, lsState, lbaLocal) =>
                    {
                        int lsX2 = lsSquares[lsX];

                        for (int lsY = 1; lsY <= lsSQRT; lsY++)
                        {
                            int lsY2 = lsSquares[lsY];

                            int lsN = 4 * lsX2 + lsY2;
                            if (lsN <= msLimit && (lsN % 12 == 1 || lsN % 12 == 5))
                                lbaLocal[lsN] ^= true;

                            lsN -= lsX2;
                            if (lsN <= msLimit && lsN % 12 == 7)
                                lbaLocal[lsN] ^= true;

                            if (lsX > lsY)
                            {
                                lsN -= lsY2 * 2;
                                if (lsN <= msLimit && lsN % 12 == 11)
                                    lbaLocal[lsN] ^= true;
                            }
                        }

                        return lbaLocal;
                    },
                    (lbaResult, start, finish) =>
                    {
                        lock (mbaPrimes) 
                            mbaPrimes.Xor(lbaResult);
                    },
                    -1
                );
            }
            else
            {
                for (int lsX = 1; lsX <= lsSQRT; lsX++)
                {
                    int lsX2 = lsSquares[lsX];

                    for (int lsY = 1; lsY <= lsSQRT; lsY++)
                    {
                        int lsY2 = lsSquares[lsY];

                        int lsN = 4 * lsX2 + lsY2;
                        if (lsN <= msLimit && (lsN % 12 == 1 || lsN % 12 == 5))
                            mbaPrimes[lsN] ^= true;

                        lsN -= lsX2;
                        if (lsN <= msLimit && lsN % 12 == 7)
                            mbaPrimes[lsN] ^= true;

                        if (lsX > lsY)
                        {
                            lsN -= lsY2 * 2;
                            if (lsN <= msLimit && lsN % 12 == 11)
                                mbaPrimes[lsN] ^= true;
                        }
                    }
                }
            }

            for (int lsN = 5; lsN < lsSQRT; lsN += 2)
                if (mbaPrimes[lsN])
                {
                    var lsS = lsSquares[lsN];
                    for (int lsK = lsS; lsK <= msLimit; lsK += lsS)
                        mbaPrimes[lsK] = false;
                }
        }
    }
}

和CompartmentalisedParallel类：

using System;
using System.Threading.Tasks;

namespace PrimeGenerator
{
    public static class CompartmentalisedParallel
    {
        #region Int

        private static int[] CalculateCompartments(int startInclusive, int endExclusive, ref int threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = endExclusive - startInclusive;

            int[] liThreadIndexes = new int[threads + 1];
            liThreadIndexes[threads] = endExclusive - 1;
            int liIndexesPerThread = (endExclusive - startInclusive) / threads;
            for (int liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        public static void For<TLocal>(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Func<int, int, TLocal> localInit,
            Func<int, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, int, int> localFinally,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        TLocal llLocal = localInit(liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);

                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState, llLocal);

                        localFinally(llLocal, liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);
                    }
                );
            else
            {
                TLocal llLocal = localInit(startInclusive, endExclusive);
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null, llLocal);
                localFinally(llLocal, startInclusive, endExclusive);
            }
        }

        public static void For(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Action<int, ParallelLoopState> body,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState);
                    }
                );
            else
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null);
        }

        public static void For(
            int startInclusive, int endExclusive,
            ParallelOptions parallelOptions,
            Action<int> body,
            int threads)
        {
            int[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread) =>
                    {
                        for (int liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter);
                    }
                );
            else
                for (int liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter);
        }

        public static void For(
            int startInclusive, int endExclusive,
            Action<int, ParallelLoopState> body,
            int threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For(
            int startInclusive, int endExclusive,
            Action<int> body,
            int threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For<TLocal>(
            int startInclusive, int endExclusive,
            Func<int, int, TLocal> localInit,
            Func<int, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, int, int> localFinally,
            int threads)
        {
            For<TLocal>(startInclusive, endExclusive, new ParallelOptions(), localInit, body, localFinally, threads);
        }

        #endregion

        #region Long

        private static long[] CalculateCompartments(long startInclusive, long endExclusive, ref long threads)
        {
            if (threads == 0) threads = 1;
            if (threads == -1) threads = Environment.ProcessorCount;
            if (threads > endExclusive - startInclusive) threads = endExclusive - startInclusive;

            long[] liThreadIndexes = new long[threads + 1];
            liThreadIndexes[threads] = endExclusive - 1;
            long liIndexesPerThread = (endExclusive - startInclusive) / threads;
            for (long liCount = 0; liCount < threads; liCount++)
                liThreadIndexes[liCount] = liCount * liIndexesPerThread;

            return liThreadIndexes;
        }

        public static void For<TLocal>(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Func<long, long, TLocal> localInit,
            Func<long, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, long, long> localFinally,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        TLocal llLocal = localInit(liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);

                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState, llLocal);

                        localFinally(llLocal, liThreadIndexes[liThread], liThreadIndexes[liThread + 1]);
                    }
                );
            else
            {
                TLocal llLocal = localInit(startInclusive, endExclusive);
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null, llLocal);
                localFinally(llLocal, startInclusive, endExclusive);
            }
        }

        public static void For(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Action<long, ParallelLoopState> body,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread, lsState) =>
                    {
                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter, lsState);
                    }
                );
            else
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter, null);
        }

        public static void For(
            long startInclusive, long endExclusive,
            ParallelOptions parallelOptions,
            Action<long> body,
            long threads)
        {
            long[] liThreadIndexes = CalculateCompartments(startInclusive, endExclusive, ref threads);

            if (threads > 1)
                Parallel.For(
                    0, threads, parallelOptions,
                    (liThread) =>
                    {
                        for (long liCounter = liThreadIndexes[liThread]; liCounter < liThreadIndexes[liThread + 1]; liCounter++)
                            body(liCounter);
                    }
                );
            else
                for (long liCounter = startInclusive; liCounter < endExclusive; liCounter++)
                    body(liCounter);
        }

        public static void For(
            long startInclusive, long endExclusive,
            Action<long, ParallelLoopState> body,
            long threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For(
            long startInclusive, long endExclusive,
            Action<long> body,
            long threads)
        {
            For(startInclusive, endExclusive, new ParallelOptions(), body, threads);
        }

        public static void For<TLocal>(
            long startInclusive, long endExclusive,
            Func<long, long, TLocal> localInit,
            Func<long, ParallelLoopState, TLocal, TLocal> body,
            Action<TLocal, long, long> localFinally,
            long threads)
        {
            For<TLocal>(startInclusive, endExclusive, new ParallelOptions(), localInit, body, localFinally, threads);
        }

        #endregion
    }
}

Primes基类：

using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    public abstract class Primes : IEnumerable<int>
    {
        protected readonly int msLimit;

        public Primes(int limit)
        {
            msLimit = limit;
        }

        public int Limit
        {
            get
            {
                return msLimit;
            }
        }

        public int Count
        {
            get
            {
                int liCount = 0;
                foreach (int liPrime in this)
                    liCount++;
                return liCount;
            }
        }

        public abstract IEnumerator<int> GetEnumerator();

        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}

通过执行以下操作来使用它：

    var lpPrimes = new Atkin(count, true);
    Console.WriteLine(lpPrimes.Count);
    Console.WriteLine(s.ElapsedMilliseconds);

然而，我发现Eratosthenes在所有情况下都更快，即使在Atkin的多线程模式下运行四核CPU：

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    public class Eratosthenes : Primes
    {
        protected BitArray mbaOddEliminated;

        public Eratosthenes(int limit)
            : base(limit)
        {
            if (mbaOddEliminated == null) FindPrimes();
        }

        public override IEnumerator<int> GetEnumerator()
        {
            yield return 2;
            for (int lsN = 3; lsN <= msLimit; lsN+=2)
                if (!mbaOddEliminated[lsN>>1]) yield return lsN;
        }

        private void FindPrimes()
        {
            mbaOddEliminated = new BitArray((msLimit>>1) + 1);
            int lsSQRT = (int)Math.Sqrt(msLimit);
            for (int lsN = 3; lsN < lsSQRT + 1; lsN += 2)
                if (!mbaOddEliminated[lsN>>1])
                    for (int lsM = lsN*lsN; lsM <= msLimit; lsM += lsN<<1)
                        mbaOddEliminated[lsM>>1] = true;
        }
    }
}

如果你让Atkin跑得更快，请告诉我！

Answer 6

使用定制的FixBitArrays和速度结果的不安全代码改进了Eratosthenes的Sieve，这比我之前的Eratosthenes算法快了大约225％，并且该类是独立的（这不是多线程的 - Eratosthenes与multi不兼容在AMD Phenom II X4 965处理器上我可以在9,261毫秒内计算Primes到1,000,000,000的限制：

using System;
using System.Collections;
using System.Collections.Generic;

namespace PrimeGenerator
{
    // The block element type for the bit array, 
    // use any unsigned value. WARNING: UInt64 is 
    // slower even on x64 architectures.
    using BitArrayType = System.UInt32;

    // This should never be any bigger than 256 bits - leave as is.
    using BitsPerBlockType = System.Byte;

    // The prime data type, this can be any unsigned value, the limit
    // of this type determines the limit of Prime value that can be
    // found. WARNING: UInt64 is slower even on x64 architectures.
    using PrimeType = System.UInt32;

    /// <summary>
    /// Calculates prime number using the Sieve of Eratosthenes method.
    /// </summary>
    /// <example>
    /// <code>
    ///     var lpPrimes = new Eratosthenes(1e7);
    ///     foreach (UInt32 luiPrime in lpPrimes)
    ///         Console.WriteLine(luiPrime);
    /// </example>
    public class Eratosthenes : IEnumerable<PrimeType>
    {
        #region Constants

        /// <summary>
        /// Constant for number of bits per block, calculated based on size of BitArrayType.
        /// </summary>
        const BitsPerBlockType cbBitsPerBlock = sizeof(BitArrayType) * 8;

        #endregion

        #region Protected Locals

        /// <summary>
        /// The limit for the maximum prime value to find.
        /// </summary>
        protected readonly PrimeType mpLimit;

        /// <summary>
        /// The current bit array where a set bit means
        /// the odd value at that location has been determined
        /// to not be prime.
        /// </summary>
        protected BitArrayType[] mbaOddNotPrime;

        #endregion

        #region Initialisation

        /// <summary>
        /// Create Sieve of Eratosthenes generator.
        /// </summary>
        /// <param name="limit">The limit for the maximum prime value to find.</param>
        public Eratosthenes(PrimeType limit)
        {
            // Check limit range
            if (limit > PrimeType.MaxValue - (PrimeType)Math.Sqrt(PrimeType.MaxValue))
                throw new ArgumentOutOfRangeException();

            mpLimit = limit;

            FindPrimes();
        }

        #endregion

        #region Private Methods

        /// <summary>
        /// Finds the prime number within range.
        /// </summary>
        private unsafe void FindPrimes()
        {
            // Allocate bit array.
            mbaOddNotPrime = new BitArrayType[(((mpLimit >> 1) + 1) / cbBitsPerBlock) + 1];

            // Cache Sqrt of limit.
            PrimeType lpSQRT = (PrimeType)Math.Sqrt(mpLimit);

            // Fix the bit array for pointer access
            fixed (BitArrayType* lpbOddNotPrime = &mbaOddNotPrime[0])
                // Scan primes up to lpSQRT
                for (PrimeType lpN = 3; lpN <= lpSQRT; lpN += 2)
                    // If the current bit value for index lpN is cleared (prime)
                    if (
                            (
                                lpbOddNotPrime[(lpN >> 1) / cbBitsPerBlock] & 
                                ((BitArrayType)1 << (BitsPerBlockType)((lpN >> 1) % cbBitsPerBlock))
                            ) == 0
                        )
                        // Leave it cleared (prime) and mark all multiples of lpN*2 from lpN*lpN as not prime
                        for (PrimeType lpM = lpN * lpN; lpM <= mpLimit; lpM += lpN << 1)
                            // Set as not prime
                            lpbOddNotPrime[(lpM >> 1) / cbBitsPerBlock] |= 
                                (BitArrayType)((BitArrayType)1 << (BitsPerBlockType)((lpM >> 1) % cbBitsPerBlock));
        }

        /// <summary>
        /// Gets a bit value by index.
        /// </summary>
        /// <param name="bits">The blocks containing the bits.</param>
        /// <param name="index">The index of the bit.</param>
        /// <returns>True if bit is set, false if cleared.</returns>
        private bool GetBitSafe(BitArrayType[] bits, PrimeType index)
        {
            return (bits[index / cbBitsPerBlock] & ((BitArrayType)1 << (BitsPerBlockType)(index % cbBitsPerBlock))) != 0;
        }

        #endregion

        #region Public Properties

        /// <summary>
        /// Get the limit for the maximum prime value to find.
        /// </summary>
        public PrimeType Limit
        {
            get
            {
                return mpLimit;
            }
        }

        /// <summary>
        /// Returns the number of primes found in the range.
        /// </summary>
        public PrimeType Count
        {
            get
            {
                PrimeType lptCount = 0;
                foreach (PrimeType liPrime in this)
                    lptCount++;
                return lptCount;
            }
        }

        /// <summary>
        /// Determines if a value in range is prime or not.
        /// </summary>
        /// <param name="test">The value to test for primality.</param>
        /// <returns>True if the value is prime, false otherwise.</returns>
        public bool this[PrimeType test]
        {
            get
            {
                if (test > mpLimit) throw new ArgumentOutOfRangeException();
                if (test <= 1) return false;
                if (test == 2) return true;
                if ((test & 1) == 0) return false;
                return !GetBitSafe(mbaOddNotPrime, test >> 1);
            }
        }

        #endregion

        #region Public Methods

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator of the primes.</returns>
        public IEnumerator<PrimeType> GetEnumerator()
        {
            // Two always prime.
            yield return 2;

            // Start at first block, second MSB.
            int liBlock = 0;
            byte lbBit = 1;
            BitArrayType lbaCurrent = mbaOddNotPrime[0] >> 1;

            // For each value in range stepping in incrments of two for odd values.
            for (PrimeType lpN = 3; lpN <= mpLimit; lpN += 2)
            {
                // If current bit not set then value is prime.
                if ((lbaCurrent & 1) == 0)
                    yield return lpN;

                // Move to NSB.
                lbaCurrent >>= 1;

                // Increment bit value.
                lbBit++;

                // If block is finished.
                if (lbBit == cbBitsPerBlock) 
                {
                    // Move to first bit of next block.
                    lbBit = 0;
                    liBlock++;
                    lbaCurrent = mbaOddNotPrime[liBlock];
                }
            }
        }

        #endregion

        #region IEnumerable<PrimeType> Implementation

        /// <summary>
        /// Gets the enumerator for the primes.
        /// </summary>
        /// <returns>The enumerator for the prime numbers.</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }

        #endregion
    }
}

在1,000,000,000中找到的素数：在9,261 ms内找到50,847,534

C＃：Atkin筛选的实施

6 个答案: