我正在学习并行计算,写了以下代码:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
/* Prints the usage message to stderr and terminates the program. */
void Usage(char* prog_name);
double f(double x); /* Function we're integrating */
/* Trapezoid-rule estimate for the calling thread's share of [a, b], where
 * [a, b] is divided into n trapezoids in total. */
double Local_trap(double a, double b, int n);
/*
 * Reads a, b, n, thread_count and repeat_times from stdin, then runs the
 * OpenMP trapezoid-rule estimate of the integral of f over [a, b]
 * repeat_times times, timing every run.
 *
 * For each thread it reports, averaged over all runs:
 *   - "forks": time from just before the parallel region until the thread
 *              has passed the initial barrier,
 *   - "runs":  time the thread spent in Local_trap,
 *   - "joins": time from the thread finishing its work until the master
 *              has left the parallel region.
 *
 * Returns 0 on success and EXIT_FAILURE on invalid input or allocation
 * failure.
 */
int main(int argc, char* argv[])
{
    double global_result = 0.0; /* Estimate produced by the most recent run */
    double a, b; /* Left and right endpoints */
    int n; /* Total number of trapezoids */
    int thread_count, repeat_times;
    double *time_fork = NULL, *time_elapsed = NULL, *time_join = NULL, *time_end = NULL;
    double global_start, global_finish, global_time = 0.0;

    printf("Enter a, b, n, thread_count and repeat times, n mod thread_count should = 0.\n");
    if (scanf("%lf %lf %d %d %d", &a, &b, &n, &thread_count, &repeat_times) != 5)
    {
        fprintf(stderr, "Could not read a, b, n, thread_count and repeat times.\n");
        return EXIT_FAILURE;
    }
    /* Non-positive values would lead to a modulo/division by zero or a
     * nonsensical allocation size further down. */
    if (n <= 0 || thread_count <= 0 || repeat_times <= 0)
    {
        fprintf(stderr, "n, thread_count and repeat times must all be positive.\n");
        return EXIT_FAILURE;
    }
    if (n % thread_count != 0)
    {
        Usage(argv[0]);
    }

    time_fork = malloc((size_t)thread_count * sizeof *time_fork);
    time_elapsed = malloc((size_t)thread_count * sizeof *time_elapsed);
    time_join = malloc((size_t)thread_count * sizeof *time_join);
    time_end = malloc((size_t)thread_count * sizeof *time_end);
    if ((NULL == time_elapsed) || (NULL == time_fork) || (NULL == time_join) || (NULL == time_end))
    {
        fprintf(stderr, "Could not allocate the timing buffers.\n");
        /* free(NULL) is a no-op, so this releases whichever calls succeeded. */
        free(time_fork);
        free(time_elapsed);
        free(time_join);
        free(time_end);
        return EXIT_FAILURE;
    }

    /* Zero every accumulator. This is kept as a parallel region, as in the
     * original code, so it still runs before the first timed region. */
#pragma omp parallel for num_threads(thread_count)
    for (int i = 0; i < thread_count; i++)
    {
        time_fork[i] = 0.0;
        time_elapsed[i] = 0.0;
        time_join[i] = 0.0;
        time_end[i] = 0.0; /* read in the join loop below, so it must be defined */
    }

    for (int i = 0; i < repeat_times; i++)
    {
        /* Fresh accumulator for every repetition: reducing straight into
         * global_result would add the estimates of all runs together. */
        double run_result = 0.0;

        global_start = omp_get_wtime();
#pragma omp parallel num_threads(thread_count) reduction(+:run_result)
        {
            /* Wait for the whole team so the start stamp is taken once
             * every thread has entered the region. */
#pragma omp barrier
            int my_rank = omp_get_thread_num();
            double my_start = omp_get_wtime();
            time_fork[my_rank] += (my_start - global_start);

            /* The actual work: this thread's share of the integral. */
            run_result += Local_trap(a, b, n);

            double my_finish = omp_get_wtime();
            time_elapsed[my_rank] += (my_finish - my_start);
            time_end[my_rank] = my_finish;
        }
        global_finish = omp_get_wtime();

#pragma omp parallel for num_threads(thread_count)
        for (int j = 0; j < thread_count; j++)
        {
            time_join[j] += (global_finish - time_end[j]);
        }
        global_time += (global_finish - global_start);
        global_result = run_result;
    }

    printf("The global run time is %.14f seconds.\n", global_time/repeat_times);
    for (int i = 0; i < thread_count; i++)
    {
        printf("The thread %d runs %.14f seconds.\n", i, time_elapsed[i]/repeat_times);
        printf("The thread %d forks %.14f seconds.\n", i, time_fork[i]/repeat_times);
        printf("The thread %d joins %.14f seconds.\n", i, time_join[i]/repeat_times);
    }
    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %.14e\n", a, b, global_result);

    free(time_fork);
    free(time_elapsed);
    free(time_join);
    free(time_end);
    return 0;
} /* main */
/*
 * Prints a usage message to stderr and terminates the program.
 *
 * prog_name: program name inserted into the usage line (main passes argv[0]).
 *
 * Does not return. Exits with EXIT_FAILURE because this is only reached on
 * invalid input; a zero status would report success to the calling shell.
 */
void Usage(char* prog_name)
{
    fprintf(stderr, "usage: %s <number of threads>\n", prog_name);
    fprintf(stderr, " number of trapezoids must be evenly divisible by\n");
    fprintf(stderr, " number of threads\n");
    exit(EXIT_FAILURE);
}
/* Integrand of the trapezoid rule: returns the square of x. */
double f(double x)
{
    return x * x;
} /* f */
/*
 * Trapezoid-rule estimate of the integral of f over the calling thread's
 * sub-interval of [a, b].
 *
 * a, b: endpoints of the whole interval.
 * n:    total number of trapezoids over [a, b].
 *
 * The thread's rank and the team size are queried from the OpenMP runtime.
 * Each thread takes n / team_size consecutive trapezoids (integer division),
 * so any remainder is not covered; callers are expected to pass an n that
 * the team size divides evenly.
 *
 * Returns this thread's partial estimate.
 */
double Local_trap(double a, double b, int n)
{
    int rank = omp_get_thread_num();
    int team_size = omp_get_num_threads();

    double width = (b-a)/n;          /* width of one trapezoid */
    int chunk = n/team_size;         /* trapezoids handled by this thread */

    double left = a + rank*chunk*width;
    double right = left + chunk*width;

    /* End points count half, interior points count fully. */
    double sum = (f(left) + f(right))/2.0;
    for (int k = 1; k < chunk; k++)
    {
        sum += f(left + k*width);
    }

    return sum*width;
}
我在 Ubuntu 14.04 上编译,笔记本电脑的 CPU 是 i3(4 个硬件线程),编译命令是 gcc -g3 -Wall -fopenmp -std=c99 -o Assignment2 Assignment2.c
输入为 1 2 12000 2 10 时的输出:
The global run time is 0.00013472399987 seconds.
The thread 0 runs 0.00013350439967 seconds.
The thread 0 forks 0.00000079790025 seconds.
The thread 0 joins 0.00000042169995 seconds.
The thread 1 runs 0.00013322920022 seconds.
The thread 1 forks 0.00000062119998 seconds.
The thread 1 joins 0.00000087359967 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01
输入为 1 2 12000 4 10 时的输出:
The global run time is 0.00781751800023 seconds.
The thread 0 runs 0.00006403259995 seconds.
The thread 0 forks 0.00621278830040 seconds.
The thread 0 joins 0.00154069709988 seconds.
The thread 1 runs 0.00006628699975 seconds.
The thread 1 forks 0.00575844590039 seconds.
The thread 1 joins 0.00199278510008 seconds.
The thread 2 runs 0.00006636039980 seconds.
The thread 2 forks 0.00551087460044 seconds.
The thread 2 joins 0.00224028299999 seconds.
The thread 3 runs 0.00006544990010 seconds.
The thread 3 forks 0.00564311910020 seconds.
The thread 3 joins 0.00210894899992 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01
我不明白为什么创建(fork)4 个线程的开销比创建 2 个线程高这么多。我的计时方法有问题吗?
答案 0 :(得分:0)
你的计时方法没有问题。在不开启任何优化的情况下,线程越多,需要的管理/同步开销就越大。但是,使用 -O3 优化后,你会看到明显的加速。我的机器有 8 个硬件线程,以下分别是使用 2、4、6、8 个线程时的运行时间:
The global run time is 0.00013457789901 seconds.
The global run time is 0.00006983749627 seconds.
The global run time is 0.00004531119484 seconds.
The global run time is 0.00032387300453 seconds.
注意,当线程数达到 8 个时,运行时间又增加了。这是并行编程中的常见问题之一:计算上可能获得的加速必须足够大,才能抵消与更多处理器通信所增加的开销。在这个例子中,8 个线程甚至比单线程程序还要慢(单线程时:The global run time is 0.00025965359819 seconds.)。
**修改**
物理核心数和线程数确实不同。一种检查方法是运行 cat /proc/cpuinfo,其输出会列出你的硬件线程。我猜你的机器有 4 个,这种技术称为超线程(hyper-threading),用于提高并行度。但是,在这两种情况下,你的程序都是使用线程而不是物理核心进行计算的。