我修改了此处链接的代码的第一个版本,使第 360 行的循环(见下面给出的代码)能够并行运行。为此,我把保存结果的变量改成了按线程划分的数组,这样每个线程都可以保存自己的结果,而不会覆盖其他线程的结果。另外,我还改写了每次循环迭代所用数值的计算方式,使它们不再依赖上一次迭代中的取值(这样就可以直接根据循环变量的值把它们计算出来)。我在这里贴出了全部代码:最小示例其实只会短 10 行左右,却会失去检查结果是否正确的任何手段。代码没有使用特殊的编译器功能,只需要在 g++ 下加上 -fopenmp 参数即可。
//
// prime_sieve.c
//
// Copyright (C) July 2002, Tomás Oliveira e Silva
//
// e-mail: tos@ua.pt
// www: http://www.ieeta.pt/~tos
//
// Comparison of two simple (but serious) implementations of the segmented sieve of
// Eratosthenes. The second implementation can generate primes reasonably fast near
// 1e18 (using around 400Mbytes of memory).
//
// _implementation_=0 gives a classical segmented sieve
// _implementation_=1 gives a cache-friendly segmented sieve
//
// See timing results for the two implementations at the end.
//
// Main idea: use one linked list for each interval of the segmented sieve, putting in it
// the primes than have an odd multiple in that interval (but not in a previous interval);
// this allows a better utilization of the processor data caches, giving significant time
// savings (up to a factor of 6) when working near 1e18. The amount of memory used is
// approximately 8*pi(sqrt(N)), where N is the last number of the interval, and pi(x) is
// the usual prime counting function.
//
// Assumptions: pointers have 4 bytes, gcc compiler
//
//
// Released under the GNU general public license (version 2 or any later version); see the
// gpl.txt file (or the page http://www.gnu.org/licenses/gpl.html) for details.
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
//
// configuration specification
//
// default parameters optimized for integers near 1e18; see tables at the end
//
#define nthreads 2
#ifndef _sieve_bits_log2_
# define _sieve_bits_log2_ 19
#endif
#ifndef _bucket_size_log2_
# define _bucket_size_log2_ 10
#endif
#if _bucket_size_log2_ > 16
# error "_bucket_size_log2_ is too large"
#endif
//
// basic type definitions
//
typedef unsigned char u08;
typedef unsigned int u32;
typedef unsigned long long u64;
//
// memory allocation
//
// Allocate `size` bytes and return a pointer aligned on a 256 byte boundary.
// The process exits with status 1 when malloc fails.  The pointer returned by
// malloc is not kept, so the block cannot be passed to free() afterwards.
static void *get_memory(u32 size)
{
  const size_t low_bits = (size_t)255;
  void *raw;
  size_t rounded;

  raw = malloc(size + 255); // 255 spare bytes leave room for rounding up
  if(raw == NULL)
    exit(1);
  // the integer round trip assumes that sizeof(void *) = sizeof(size_t)
  rounded = ((size_t)raw + low_bits) & ~low_bits;
  return (void *)rounded;
}
//
// count the number of zeros
//
// Return the number of zero bits in the `size` bytes starting at `addr`.
//
// The count is computed directly from every byte, without any shared lookup
// table, so the function keeps no state between calls and may be called from
// several threads at the same time.
static u32 count_zero_bits(u08 *addr,u32 size)
{
  u32 zeros = 0;
  u32 n;

  for(n = 0;n < size;n++)
  {
    // invert the byte: the set bits of x are the zero bits of addr[n]
    u32 x = ((u32)addr[n] & 255) ^ 255;

    // count the set bits of the 8-bit value: pairs, then nibbles, then the byte
    x = x - ((x >> 1) & 0x55);
    x = (x & 0x33) + ((x >> 2) & 0x33);
    zeros += (x + (x >> 4)) & 0x0F;
  }
  return zeros;
}
//
// generation of the (small) primes used by the main sieve
//
#define number_of_small_primes 6541// number of odd primes below 2 ^ 16 (set_small_primes stores the odd primes 3..65535)
// odd primes below 65536, filled once by set_small_primes
static u32 small_primes[number_of_small_primes];
// per-thread sieve window: 1024 words = 32768 bits, one bit per odd number, i.e. 65536 integers;
// a set bit marks a composite
static u32 small_sieve[nthreads][1024];
// per-thread first integer covered by small_sieve (kept a multiple of 65536 by init_main_sieve)
static u32 small_base[nthreads];
// Rebuild small_sieve[th_id] for the window of 65536 integers that starts at
// small_base[th_id].  Bit b of the window stands for the odd number
// small_base[th_id] + 2 * b + 1; after the call a set bit marks a composite.
// Only the row th_id of small_sieve is written; small_primes is only read.
static void update_small_sieve(u32 th_id)
{
  u32 i,j;

  for(j = 0;j < 1024;j++)
    small_sieve[th_id][j] = 0;
  for(i = 0;i < number_of_small_primes;i++)
  {
    j = small_primes[i] * small_primes[i];
    if(j >= small_base[th_id] + 65536)
      break; // the square lies beyond the window, and so do those of all larger primes
    if(j < small_base[th_id])
    {
      // start at the first odd multiple of the prime that is >= small_base
      j = small_base[th_id] / small_primes[i];
      j *= small_primes[i];
      if(j < small_base[th_id])
        j += small_primes[i];
      if((j & 1) == 0)
        j += small_primes[i];
    }
    // a step of p in bit numbers is a step of 2 * p in integers
    for(j = (j - small_base[th_id]) >> 1;j < 32768;j += small_primes[i])
      small_sieve[th_id][j >> 5] |= 1u << (j & 31); // 1u: shifting a signed 1 by 31 is undefined
  }
}
//
// main sieve
//
// the following structure is used to record the
// information required to sieve an interval
//
// the value of _bucket_size_log2_ should
// be small (and a multiple of the L1 or L2 data cache line size)
//
// Number of (prime, offset) entries per bucket.  Each entry takes 8 bytes, so with
// 4-byte pointers (next + count = 8 bytes) a bucket occupies exactly
// 2 ^ _bucket_size_log2_ bytes; with 8-byte pointers the structure is larger.
#define primes_per_bucket ((1 << (_bucket_size_log2_ - 3)) - 1)
// One node of a singly linked list of sieving primes.
typedef struct bucket
{
struct bucket *next; // pointer to next bucket
u32 count; // count of the number of primes in this bucket
struct
{
u32 p; // prime
u32 o; // the bit number of the first odd multiple (>= main_base) of the prime
}
data[primes_per_bucket];
}
bucket;
// per-thread sieve of 2 ^ _sieve_bits_log2_ bits; do_main_sieve sets a bit for every multiple it strikes out
static u32 main_sieve[nthreads][1 << (_sieve_bits_log2_ - 5)];
static u64 main_limit; // is not modified in parallel
// main_lists[t][k] is the head of bucket list k of thread t (1 << list_size_log2 lists per thread);
// available_buckets[t] is the head of thread t's list of unused buckets
static bucket **main_lists[nthreads],*available_buckets[nthreads];
// log2 of the number of bucket lists per thread, set in main
static u32 list_size_log2;
void more_buckets(int th_id) { u32 i,j; i = 1 << (20 - _bucket_size_log2_);
available_buckets[th_id] = (bucket *)get_memory(i * sizeof(bucket)); for(j = 0;j < i;j++)
available_buckets[th_id][j].next = (j < i - 1) ? &available_buckets[th_id][j + 1] : NULL; }
// Take one bucket from the free list of thread `th_id` (refilling the free list
// first when it is empty), reset its entry count and push it on the front of
// that thread's bucket list number `k`.
void new_bucket(u64 k,int th_id)
{
  bucket *fresh;

  if(available_buckets[th_id] == NULL)
    more_buckets(th_id);
  fresh = available_buckets[th_id];
  available_buckets[th_id] = fresh->next;
  fresh->count = 0;
  fresh->next = main_lists[th_id][k];
  main_lists[th_id][k] = fresh;
}
// Insert into the bucket lists of thread `th_id` every prime p, taken from the
// odd candidates next_prime, next_prime + 2, ..., whose square is below the end
// of the interval that starts at main_base.
//
//   main_base    first integer of the interval being prepared
//   th_id        selects the per-thread small sieve, bucket lists and free list
//   next_prime   first candidate to examine (passed by value; the caller's copy
//                is not advanced)
//   current_list index of the bucket list that belongs to the interval starting
//                at main_base
//
// Each prime is stored together with the bit number of its first odd multiple
// >= main_base, in the list of the interval in which that multiple falls.
static void init_main_sieve(const u64 main_base, const u32 th_id, u32 next_prime, const u32 current_list)
{
  u64 t,end;
  u32 i,j;
  u32 k;

  end = main_base + (u64)(2 << _sieve_bits_log2_);
  // make sure the small sieve window contains the first candidate
  if(small_base[th_id] != (next_prime / 65536) * 65536)
  {
    small_base[th_id] = (next_prime / 65536) * 65536;
    update_small_sieve(th_id);
  }
  while((t = (u64)next_prime * (u64)next_prime) < end)
  {
    if(next_prime >= small_base[th_id] + 65536)
    {
      small_base[th_id] += 65536;
      update_small_sieve(th_id);
    }
    // only odd numbers are stored in the small sieve, so the distance to the
    // window base is halved to obtain the bit number
    i = (next_prime - small_base[th_id]) >> 1;
    if((small_sieve[th_id][i >> 5] & (1u << (i & 31))) == 0) // is next_prime a prime?
    {
      if(t < main_base) // move t to the first odd multiple of the prime that is >= main_base
      {
        t = main_base / (u64)next_prime;
        t *= (u64)next_prime;
        if(t < main_base)
          t += (u64)next_prime;
        if(((u32)t & 1) == 0)
          t += (u64)next_prime;
      }
      i = (u32)((t - main_base) >> 1); // bit number
      // the high bits of the bit number select the interval (list), the low bits the position in it
      k = (current_list + (i >> _sieve_bits_log2_)) & ((1 << list_size_log2) - 1);
      if(main_lists[th_id][k]->count == primes_per_bucket)
        new_bucket(k, th_id); // only this thread's lists and free list are touched
      j = main_lists[th_id][k]->count++;
      main_lists[th_id][k]->data[j].p = next_prime;
      main_lists[th_id][k]->data[j].o = i & ((1 << _sieve_bits_log2_) - 1);
    }
    next_prime += 2;
  }
}
// Sieve the interval that starts at main_base into main_sieve[th_id].
//
//   main_base    first integer of the interval
//   th_id        selects the per-thread sieve, bucket lists and free list
//   next_prime   first candidate handed to init_main_sieve for this interval
//   current_list index of the bucket list that holds the primes of this interval
//
// After the call a zero bit in main_sieve[th_id] marks a number that none of the
// primes in the list struck out.  Every processed prime is moved to the list of
// the interval that contains its next odd multiple, so the lists of one thread
// carry state from each interval to the following one.
static void do_main_sieve(const u64 main_base, const u32 th_id, u32 next_prime, const u32 current_list)
{
  bucket *b;
  bucket *c;
  u32 j,k;
  u32 i,p,o;

  init_main_sieve(main_base, th_id, next_prime, current_list);
  for(i = 0;i < (1u << (_sieve_bits_log2_ - 5));i++)
    main_sieve[th_id][i] = 0;
  b = main_lists[th_id][current_list];
  while(b != NULL)
  {
    for(i = 0;i < b->count;i++)
    {
      p = b->data[i].p;
      // all bits of o except the lowest five select the u32 word, the lowest
      // five bits select the bit in that word that is set to 1 (= multiple of p)
      for(o = b->data[i].o;o < (1u << _sieve_bits_log2_);o += p)
        main_sieve[th_id][o >> 5] |= 1u << (o & 31); // 1u: shifting a signed 1 by 31 is undefined
      // o now points past this interval: file the prime under the interval it falls in
      k = (current_list + (o >> _sieve_bits_log2_)) & ((1 << list_size_log2) - 1);
      if(main_lists[th_id][k]->count == primes_per_bucket)
        new_bucket(k, th_id);
      j = main_lists[th_id][k]->count++;
      main_lists[th_id][k]->data[j].p = p;
      main_lists[th_id][k]->data[j].o = o & ((1 << _sieve_bits_log2_) - 1);
    }
    // return the emptied bucket to this thread's free list
    c = b;
    b = b->next;
    c->next = available_buckets[th_id];
    available_buckets[th_id] = c;
  }
  // leave the list with a single empty bucket, ready for reuse
  main_lists[th_id][current_list] = NULL;
#pragma omp critical
  new_bucket(current_list, th_id);
}
// Fill small_primes with the odd primes below 65536, using small_sieve[0] as
// scratch space.  Bit i >> 1 of small_sieve[0] stands for the odd number i, so
// on return small_sieve[0] holds the sieve of the window that starts at 0.
// The work is done on the first call only; the process exits with status 2
// when the number of primes found differs from number_of_small_primes.
void set_small_primes(void)
{
  u32 i,j;

  if(small_primes[0] != 0)
    return; // already initialized
  for(j = 0;j < 1024;j++)
    small_sieve[0][j] = 0;
  for(i = 3;i < 256;i += 2) // 256 ^ 2 = 65536, so larger primes have nothing left to strike out
    if((small_sieve[0][i >> 6] & (1u << ((i >> 1) & 31))) == 0)
      for(j = (i * i) >> 1;j < 32768;j += i)
        small_sieve[0][j >> 5] |= 1u << (j & 31); // 1u: shifting a signed 1 by 31 is undefined
  j = 0;
  for(i = 3;i < 65536;i += 2)
    if((small_sieve[0][i >> 6] & (1u << ((i >> 1) & 31))) == 0)
      small_primes[j++] = i;
  if(j != number_of_small_primes)
    exit(2); // this should never happen
}
//
// main program
//
int main(int argc,char **argv)
{
double t;
u32 i,j;
u64 pi, counter=0;
u64 main_base;
int ntasks = 1;
u32 next_prime = 3;
u32 current_list = 0;
omp_set_num_threads(nthreads);
if(argc == 1)
i = 15;
else
i = atoi(argv[1]);
if(i < 6)
i = 6;
if(i > 18)
i = 18;
printf("%2u %2u",_sieve_bits_log2_,_bucket_size_log2_);
main_base = 1ull;
for(j = 0;j < i;j++)
main_base *= 10ull;
main_limit = main_base + 2000000000ull;
// set list_size_log2
u32 l;
l = 1 + (u32)ceil(sqrt((double)main_limit));
l = 2 + (l >> _sieve_bits_log2_);
for(list_size_log2 = 2;(1 << list_size_log2) < l;list_size_log2++)
;
//set main_lists
for (int i = 0; i < nthreads;i++) {
available_buckets[i] = NULL;
main_lists[i] = (bucket **)get_memory((1 << list_size_log2) * sizeof(bucket *));
for(u32 k = 0;k < (1 << list_size_log2);k++)
{
main_lists[i][k] = NULL;
new_bucket(k, i);
}
}
//set_small_primes
t = (double)clock();
for (int i = 0; i < nthreads;i++) small_base[i] = 0;
set_small_primes();
printf(" %2d",i);
// init main sieve
init_main_sieve(main_base,0, next_prime, current_list);
t = ((double)clock() - t) / (double)CLOCKS_PER_SEC;
printf(" %6.2f",t);
j = 1 << (_sieve_bits_log2_ - 3);
pi = 0ull;
main_limit = main_base + 1000000000ull;
if(((u32)main_base | (u32)main_limit) & 63)
{
fprintf(stderr,"Warning: prime number counts may be incorrect\n");
fprintf(stderr," main_base and main_limit should be multiples of 64\n");
}
// calculate iteration count fast
t = (double)clock();
u64 main_base_tmp = main_base;
const u64 main_base_const = main_base_tmp;
for(;;)
{
i = (u32)(main_limit - main_base_tmp) >> 4;
if(i <= j)
break;
main_base_tmp += (u64)j << 4;
counter++;
}
{
//prepare values
int th_id = omp_get_thread_num();
u64 main_base_private = main_base_const;
u64 end = main_base_private + (u64)(2 << _sieve_bits_log2_);
u32 next_prime_private = next_prime;
while ((u64) next_prime_private * (u64) next_prime_private < end) next_prime_private += 2;
next_prime = next_prime_private;
// call function
do_main_sieve(main_base_private, th_id, next_prime_private, current_list);
// calculate results
pi += (u64)count_zero_bits((u08 *)main_sieve[th_id],j);
}
while (1) printf("B");
#pragma omp parallel for //private (main_base)
for(u64 c=1;c<counter;c++)
{
//prepare values
u32 current_list_private = current_list;
for (u64 count = 0; count < c; count++)
current_list_private = (current_list_private + 1) & ((1 << list_size_log2) - 1);
int th_id = omp_get_thread_num();
u64 main_base_private = main_base_const+((u64)j << 4)*(c);
u64 end = main_base_const+((u64)j << 4)*(c-1) + (u64)(2 << _sieve_bits_log2_);
u32 next_prime_private = next_prime;
while ((u64) next_prime_private * (u64) next_prime_private < end) next_prime_private += 2;
// call function
do_main_sieve(main_base_private, th_id, next_prime_private, current_list_private);
// calculate results
#pragma omp atomic
pi += (u64)count_zero_bits((u08 *)main_sieve[th_id],j);
printf(" %llu",c);
}
main_base = main_base_const+((u64)j << 4)*(counter);
u64 end = main_base + (u64)(2 << _sieve_bits_log2_);
while ((u64) next_prime * (u64) next_prime < end) next_prime += 2;
for (u64 count = 0; count < counter; count++)
current_list = (current_list + 1) & ((1 << list_size_log2) - 1);
do_main_sieve(main_base, 0, next_prime, current_list);
i = (u32)(main_limit - main_base) >> 4;
pi += (u64)count_zero_bits((u08 *)main_sieve[0],i);
t = ((double)clock() - t) / (double)CLOCKS_PER_SEC;
printf(" %7.2f %8llu\n",t,pi);
return 0;
}
我检查了这段代码中用到的所有变量:它们应该只依赖于循环变量(其余变量也只是根据循环变量计算出来的),而不依赖其他任何东西。更具体地说,next_prime、main_base、small_base、small_sieve、available_buckets、main_sieve、current_list 都不应该造成任何问题。
如果有人能帮忙看一下,并告诉我为什么只要把线程数设为大于 1,我就总是得到同一个错误的结果,我将不胜感激。也许某些 IDE 能够帮助定位这个问题,但我很少使用 CodeLite,也不知道如何从中获取这类信息。