这是一个主要的生成器程序,当处理器int numProc
的数量为1时它工作正常,但如果用户认为有多个处理器,我想使用并行化的质数版本,以便生成分发给许多进程。
注意:我已在int main中取出标志处理和菜单显示代码以节省空间
代码:
#define NUMITERS 1
#define MAXSIZE 500000
#define MAXNUMPROCS 64
#define PRIME(num) (2*(num) + 3)
#define NUM(prime) (((prime) - 3)/2)
#define TRUE 1
#define FALSE 0
int lastPrime, count; /* Last Prime and Number of Primes Found */
int size = 100000; /* Number of numbers to test for prime */
int numProcs = 5; /* Number of processors */
FILE *out = NULL; /* File to output primes to */
char *flags; /* Array of primes (odd numbers only)
i.e. flags[0] corresponds to 3
flags[1] corresponds to 5
flags[n] corresponds to 2*n+3
flags[i] is TRUE if i is a prime */
void primes(void); /* procedure prototype */
void parallelPrimes(void); /* procedure prototype */
int main(int argc, char *argv[])
{
clock_t t;
t = clock();
int i, opt;
/* MENU DISPLAY and flag operations here */
if (numProcs == 1)
primes(); /* Call primes routine */
else
parallelPrimes(); /* call multi-threaded/parallelized
/* print out all of the primes found */
if (out != NULL) {
int i;
fprintf(out, "2\n");
for (i = 0; i < size; ++i)
if (flags[i])
fprintf(out, "%d\n", PRIME(i));
}
free(flags);
printf(" Number of primes = %d, largest prime = %d\n", count, lastPrime);
t = clock() - t;
printf ("It took me %d clicks (%f seconds).\n",t,((float)t)/CLOCKS_PER_SEC);
}
void primes()
{
int i,iter, prime, div1, div2, rem;
for (iter=0; iter < NUMITERS; ++iter)
{
count = 0;
lastPrime = 0;
for (i=0; i < size; ++i) { /* For every odd number */
prime = PRIME(i);
div1=1;
do {
div1 += 2; /* Divide by 3, 5, 7, ... */
div2 = prime / div1; /* Find the dividend */
rem = prime % div1; /* Find remainder */
} while (rem != 0 && div1 <= div2);
if (rem != 0 || div1 == prime) {
/* prime is really a prime */
flags[i] = TRUE;
count++;
lastPrime = prime;
} else {
/* prime is not a prime */
flags[i] = FALSE;
}
}
}
}
到目前为止,我已尝试在以下函数中使用#include<omp.h>
:
void parallelPrimes()
{
int i;
int iter, prime;
int div1, div2, rem;
for (iter=0; iter < NUMITERS; ++iter)
/*
Don't parallelize this loop??? */
{
count = 0;
lastPrime = 0;
omp_set_num_threads(numProcs);
#pragma omp parallel
int ID = omp_get_thread_num();
#pragma omp for
for (i=0; i < size; ++i) { /* For every odd number */
prime = PRIME(i);
/* Keep searching for divisor until rem == 0 (i.e. non prime),
or we've reached the sqrt of prime (when div1 > div2) */
div1=1;
do {
div1 += 2; /* Divide by 3, 5, 7, ... */
div2 = prime / div1; /* Find the dividend */
rem = prime % div1; /* Find remainder */
} while (rem != 0 && div1 <= div2);
if (rem != 0 || div1 == prime) {
/* prime is really a prime */
flags[i] = TRUE;
count++;
lastPrime = prime;
} else {
/* prime is not a prime */
flags[i] = FALSE;
}
}
}
}
但它实际上比非并行化函数primes()
慢,我是否在这个函数中错误地实现了并行性?
答案 0 :(得分:2)
[编辑]我建议查看TBB的例子&#39; primes&#39;。但我承认,将它移植到OpenMP并不是一件容易的事。
它使用parallel_reduce
(用于计算素数)与自定义SieveRange
,但它仍然可以转换为omp parallel
,因为范围本质上是:
NumberType middle = r.my_begin + (r.my_end-r.my_begin+r.my_stride-1)/2;
middle = middle/my_stride*my_stride;
即。为通常的parallel_for方法增加了一个步伐。
另一个障碍是Sieve
仿函数持有Multiplies
类的私有副本,应该重新编写以匹配openmp方法。
E.g。 ParallelCountPrimes()
可以使用这个并行结构重新编写:
#pragma omp parallel
{
Multiples multiples(n); // per-thread copy, needs reworking to remain 'uninitialized'
NumberType m = multiples.m;
#pragma omp for
for(NumberType i = multiples.m; i < n; i += multiples.m*CHUNKSIZE) {
if( !multiples.is_initialized() )
multiples.initialize( i ); // initialize by index of the first iteration
NumberType window_size = m;
NumberType end = i+multiples.m*CHUNKSIZE;
if(end>n) end = n;
for( NumberType j=i; j<end; j+=window_size ) {
if( j+window_size>end )
window_size = end-j;
multiples.find_primes_in_window( j, window_size );
}
}
}