OpenMP:fork 2 个线程比 fork 4 个线程快得多,为什么?

时间:2016-01-18 06:26:51

标签: c linux multithreading openmp

我正在学习并行计算。我写了以下代码

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
/* Prints a usage message to stderr and terminates the program. */
void Usage(char* prog_name);
double f(double x);    /* Function we're integrating (x*x in this file) */
/* Trapezoid-rule estimate over the calling thread's slice of [a, b]. */
double Local_trap(double a, double b, int n);
/*
 * Benchmarks a parallel trapezoid-rule integration of f over [a, b].
 *
 * Reads a, b, n, thread_count and repeat_times from stdin, runs the parallel
 * region repeat_times times and prints, averaged over the repetitions:
 *   - the wall time of the whole parallel region,
 *   - per thread: time spent computing ("runs"), the delay between the
 *     master starting the region and the thread starting work ("forks"),
 *     and the delay between the thread finishing and the master resuming
 *     after the region ("joins").
 * Finally prints the integral estimate.
 *
 * Returns 0 on success and EXIT_FAILURE if memory cannot be allocated;
 * invalid input is reported through Usage(), which terminates the program.
 */
int main(int argc, char* argv[])
{
    double  global_result = 0.0;  /* Estimate from the latest repetition */
    double  a, b;                 /* Left and right endpoints            */
    int     n;                    /* Total number of trapezoids          */
    int     thread_count, repeat_times;
    double *time_fork, *time_elapsed, *time_join, *time_end;
    double global_start, global_finish, global_time = 0.0;

    printf("Enter a, b, n, thread_count and repeat times, n mod thread_count should = 0.\n");
    /* All five values are required; otherwise the variables are indeterminate. */
    if (scanf("%lf %lf %d %d %d", &a, &b, &n, &thread_count, &repeat_times) != 5)
        Usage(argv[0]);

    /*
     * thread_count is used as a divisor and as an array length, and
     * repeat_times is used as a divisor when averaging, so both must be
     * positive. Checking thread_count first keeps the modulo well defined.
     */
    if (n <= 0 || thread_count <= 0 || repeat_times <= 0)
        Usage(argv[0]);
    if (n % thread_count != 0)
        Usage(argv[0]);

    time_fork    = malloc((size_t)thread_count * sizeof *time_fork);
    time_elapsed = malloc((size_t)thread_count * sizeof *time_elapsed);
    time_join    = malloc((size_t)thread_count * sizeof *time_join);
    time_end     = malloc((size_t)thread_count * sizeof *time_end);

    if ((NULL == time_elapsed) || (NULL == time_fork) || (NULL == time_join) || (NULL == time_end))
    {
        /* Release whatever was obtained (free(NULL) is a no-op) and report failure. */
        fprintf(stderr, "Failed to allocate the timing buffers.\n");
        free(time_fork);
        free(time_elapsed);
        free(time_join);
        free(time_end);
        return EXIT_FAILURE;
    }

    /*
     * Zero the accumulators. time_end is cleared as well so that it is never
     * read while indeterminate. The loop is kept parallel as in the original;
     * NOTE(review): it presumably also lets the runtime create its thread
     * team before the timed region - confirm for the OpenMP runtime in use.
     */
    #pragma omp parallel for num_threads(thread_count)
    for (int i = 0;i < thread_count;i++)
    {
        time_fork[i] = 0.0;
        time_elapsed[i] = 0.0;
        time_join[i] = 0.0;
        time_end[i] = 0.0;
    }

    for (int i = 0;i < repeat_times;i++)
    {
        /*
         * The reduction adds the threads' partial sums to the value the
         * variable already holds, so it must be reset for every repetition;
         * otherwise the printed estimate is repeat_times times too large.
         */
        global_result = 0.0;

        global_start = omp_get_wtime();
        #pragma omp parallel num_threads(thread_count) reduction(+:global_result)
        {
            /* Line all threads up before taking the per-thread start time. */
            #pragma omp barrier
            double my_start, my_finish, my_elapsed;
            int my_rank = omp_get_thread_num();
            my_start = omp_get_wtime();
            time_fork[my_rank] += (my_start - global_start);

            /* The actual work: this thread's share of the trapezoids. */
            global_result += Local_trap(a, b, n);

            my_finish  = omp_get_wtime();
            my_elapsed = my_finish - my_start;
            time_elapsed[my_rank] += my_elapsed;
            time_end[my_rank] = my_finish;
        }
        global_finish = omp_get_wtime();

        /* Join cost: gap between a thread finishing and the master resuming. */
        #pragma omp parallel for num_threads(thread_count)
        for (int j = 0;j < thread_count;j++)
        {
            time_join[j] += (global_finish - time_end[j]);
        }

        global_time += (global_finish - global_start);
    }

    printf("The global run time is %.14f seconds.\n", global_time/repeat_times);
    for(int i = 0; i < thread_count;i++)
    {
        printf("The thread %d runs  %.14f seconds.\n", i, time_elapsed[i]/repeat_times);
        printf("The thread %d forks %.14f seconds.\n", i, time_fork[i]/repeat_times);
        printf("The thread %d joins %.14f seconds.\n", i, time_join[i]/repeat_times);
    }

    printf("With n = %d trapezoids, our estimate\n", n);
    printf("of the integral from %f to %f = %.14e\n", a, b, global_result);

    free(time_fork);
    free(time_elapsed);
    free(time_join);
    free(time_end);
    return 0;
}  /* main */
/*
 * Prints a usage message to stderr and terminates the program.
 *
 * prog_name: program name shown in the message (main passes argv[0]).
 *
 * Does not return. The process exits with EXIT_FAILURE because this is only
 * reached on invalid input; exiting with 0 would signal success to the
 * calling shell or script.
 */
void Usage(char* prog_name)
{
    fprintf(stderr, "usage: %s <number of threads>\n", prog_name);
    fprintf(stderr, "   number of trapezoids must be evenly divisible by\n");
    fprintf(stderr, "   number of threads\n");
    exit(EXIT_FAILURE);
}
/* Integrand: returns the square of x. */
double f(double x)
{
    return x * x;
}  /* f */

/*
 * Trapezoid-rule estimate of the integral of f over the calling thread's
 * slice of [a, b].
 *
 * a, b: endpoints of the whole interval.
 * n:    total number of trapezoids over [a, b]; each thread handles
 *       n / omp_get_num_threads() of them (integer division).
 *
 * The slice is selected from omp_get_thread_num(), so the partial results of
 * all threads in the team have to be summed by the caller.
 * Returns this thread's partial estimate.
 */
double Local_trap(double a, double b, int n)
{
    const int rank       = omp_get_thread_num();
    const int team_size  = omp_get_num_threads();
    const double h       = (b-a)/n;            /* Width of one trapezoid */
    const int local_n    = n/team_size;        /* Trapezoids per thread  */
    const double local_a = a + rank*local_n*h; /* Left end of the slice  */
    const double local_b = local_a + local_n*h;

    /* The two end points count half; the interior points count fully. */
    double sum = (f(local_a) + f(local_b))/2.0;
    for (int k = 1; k < local_n; k++)
    {
        sum += f(local_a + k*h);
    }

    return sum*h;
} 

我在ubuntu 14.04上编译,我的笔记本电脑是i3,4线程,命令是gcc -g3 -Wall -fopenmp -std=c99 -o Assignment2 Assignment2.c

输入为 1 2 12000 2 10 时的输出:

The global run time is 0.00013472399987 seconds.
The thread 0 runs  0.00013350439967 seconds.
The thread 0 forks 0.00000079790025 seconds.
The thread 0 joins 0.00000042169995 seconds.
The thread 1 runs  0.00013322920022 seconds.
The thread 1 forks 0.00000062119998 seconds.
The thread 1 joins 0.00000087359967 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01

输入为 1 2 12000 4 10 时的输出:

The global run time is 0.00781751800023 seconds.
The thread 0 runs  0.00006403259995 seconds.
The thread 0 forks 0.00621278830040 seconds.
The thread 0 joins 0.00154069709988 seconds.
The thread 1 runs  0.00006628699975 seconds.
The thread 1 forks 0.00575844590039 seconds.
The thread 1 joins 0.00199278510008 seconds.
The thread 2 runs  0.00006636039980 seconds.
The thread 2 forks 0.00551087460044 seconds.
The thread 2 joins 0.00224028299999 seconds.
The thread 3 runs  0.00006544990010 seconds.
The thread 3 forks 0.00564311910020 seconds.
The thread 3 joins 0.00210894899992 seconds.
With n = 12000 trapezoids, our estimate
of the integral from 1.000000 to 2.000000 = 2.33333333449074e+01

我不知道为什么分叉4个线程的成本与分叉2个线程相比是如此昂贵。 我的测量时间方法有问题吗?

1 个答案:

答案 0 :(得分:0)

你的计时方法没有问题。在不开任何优化的情况下,线程越多,所需的管理/同步开销也越大。但是,使用 -O3 优化后,你会看到明显的加速。我的机器有 8 个线程,下面依次是使用 2、4、6、8 个线程时的运行时间:

The global run time is 0.00013457789901 seconds.
The global run time is 0.00006983749627 seconds.
The global run time is 0.00004531119484 seconds.
The global run time is 0.00032387300453 seconds.

注意,当线程数达到 8 时,运行时间又增加了。这是并行编程中的常见问题之一:计算本身的潜在加速必须足够大,才能抵消与更多处理器通信所增加的成本。在这种情况下,8 个线程甚至比单线程程序(The global run time is 0.00025965359819 seconds.)还要慢。

修改:物理核心数和线程数确实不同。一种检查方法是运行 cat /proc/cpuinfo,输出会列出你的线程。我猜有 4 个。这种技术称为超线程(hyper-threading),用于提高并行度。但是,在这两种情况下,你的程序都是用线程而不是核心来进行计算的。