我正在尝试编写一个并行程序,该程序的错误率(即0.01)并返回的PI值比montecarlo模拟的错误更接近PI。 我写了一个简单的函数,但是它不会终止,因为错误率始终在11左右。 感谢您的评论。
#include "stdio.h"
#include "omp.h"
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
double drand48(void);
double monte_carlo(double epsilon){
double x,y, pi_estimate = 0.0;
double drand48(void);
double error = 10000.0;
int n = 0; // total number of points
int i = 0; // total numbers of points inside circle
int p = omp_get_num_threads();
while(error>=epsilon){
#pragma omp parallel private(x, y) reduction(+:i)//OMP parallel directive
{
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){i+=1;}
}
n+=p;
printf("%lf\n", error);
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
}
return pi_estimate;
}
int main(int argc, char* argv[]) {
double epsilon = 0.01;
printf("PI estimate: %lf",monte_carlo(epsilon));
return 0;
}
答案 0 :(得分:2)
在并行段之外调用omp_get_num_threads()
将始终返回1
,因为在调用该函数时只有一个活动线程。下面的代码应该给出正确的结果,但是由于执行非常简单的操作会花费大量的并行化和同步开销,因此它将比串行版本慢得多。
#pragma omp parallel private(x, y) reduction(+:i)//OMP parallel directive
{
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){i+=1;}
#pragma omp master
n+=omp_get_num_threads();
}
以下内容避免了重复生成线程,并且可能会更有效,但可能仍然更慢。
#pragma omp parallel private(x, y)
while(error>=epsilon){
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0){
#pragma omp atomic
i++;
}
#pragma omp barrier
#pragma omp single
{
n+=omp_get_num_threads();
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
printf("%lf\n", error);
} // implicit barrier here
}
为了真正更快,应该给出最小的迭代次数,例如:
#define ITER 1000
#pragma omp parallel private(x, y)
while(error>=epsilon){
#pragma omp for reduction(+:i)
for (int j=1;j<ITER;j++){
x = drand48();
y = drand48();
if((x*x+y*y)<=1.0) i+=1;
}
/* implicit barrier + implicit atomic addition
* of thread-private accumulator to shared variable i
*/
#pragma omp single
{
n+=ITER;
pi_estimate=4.0*(double)i/(double)n;
error = fabs(M_PI-pi_estimate)/M_PI;
printf("%lf\n", error);
} // implicit barrier
}