OpenMP: serial faster than parallel

Date: 2018-12-07 19:43:10

Tags: c openmp

The code is parallelized, but I don't know why it is slower than my serial version; the program also slows down when I increase the number of threads to 7-10.

I have been trying to find out where the problem is, but it has been difficult for me.

I made the for loops parallel, but it does not seem to work properly. I do not get any errors when running the code.

#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>


int main(int argc, char *argv[])
{

    int m; 
    int n;
    double tol;// = 0.0001;
    double tstart, tstop;

    int i, j, iter, nthreads;



    m = atoi(argv[1]);
    n = atoi(argv[2]);
    tol = atof(argv[3]);

    double t[m+2][n+2], tnew[m+1][n+1], diff, difmax,priv_difmax;

    printf("%d %d %lf\n",m,n,tol);
    printf("Enter the number of threads (max 10) ");
    scanf("%d",&nthreads);

    omp_set_num_threads(nthreads);
    tstart = omp_get_wtime ();

    //** initialise temperature array*


    #pragma omp parallel for schedule(static)\
    default(shared) private(i,j)
    for (i=0; i <= m+1; i++) {
        for (j=0; j <= n+1; j++) {
            t[i][j] = 30.0;
        }
    }

    //*** fix boundary conditions***


    for (i=1; i <= m; i++) {
        t[i][0] = 20.0;
        t[i][n+1] = 100.0;
    }
    for (j=1; j <= n; j++) {
        t[0][j] = 10.0;
        t[m+1][j] = 140.0;
    }


    //** main loop**


    iter = 0;
    difmax = 1000000.0;
    while (difmax > tol) {
        iter++;

        // **update temperature for next iteration**


        #pragma omp parallel for schedule(static) \
        default(shared) private(i,j)
        for (i=1; i <= m; i++) {
            for (j=1; j <= n; j++) {
                tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
            }
        }

        // **work out maximum difference between old and new temperatures**

        difmax = 0.0;

        #pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
        {
            priv_difmax = 0.0;
            #pragma omp for schedule(static)
            for (i=1; i <= m; i++) {
                for (j=1; j <= n; j++) {
                    diff = fabs(tnew[i][j]-t[i][j]);

                    if (diff > priv_difmax) {
                        priv_difmax = diff;
                    }

                    //** copy new to old temperatures**
                    t[i][j] = tnew[i][j];
                }
                #pragma omp critical 
                if (priv_difmax > difmax){
                    difmax = priv_difmax;
                }
            }
        }

    }
    tstop = omp_get_wtime ();

    // print results

    printf("iter = %d  difmax = %9.11lf", iter, difmax);

    for (i=0; i <= m+1; i++) {
        printf("\n");
        for (j=0; j <= n+1; j++) {
            printf("%3.5lf ", t[i][j]);
        }
    }

    printf("\n");
    tstop = omp_get_wtime ();

    printf("time taken is %4.3lf\n", (tstop-tstart));
    printf("\n");
}

1 Answer:

Answer 0 (score: 0)

I do not see an obvious problem, apart from this piece of code:

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static)
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
        #pragma omp critical 
        if (priv_difmax > difmax){
            difmax = priv_difmax;
        }
    }
}

The reduction part, where priv_difmax is copied into difmax, should be moved out of the loop, so that each thread goes through the critical section only once rather than at every iteration of the outer loop.

#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
    priv_difmax = 0.0;
    #pragma omp for schedule(static) nowait //no need to wait after the loop
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);
            if (diff > priv_difmax) {
                priv_difmax = diff;
            }
            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }
    // Finish the loop first, then update difmax
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }
} //Implicit barrier
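
As a side note (not part of the original answer), compilers supporting OpenMP 3.1 or later can express this max-reduction directly with a reduction(max:...) clause, which removes the need for the private copy and the critical section. A minimal sketch, assuming such a compiler:

difmax = 0.0;
#pragma omp parallel for schedule(static) default(shared) \
        private(j, diff) reduction(max:difmax)
for (i=1; i <= m; i++) {
    for (j=1; j <= n; j++) {
        diff = fabs(tnew[i][j]-t[i][j]);
        if (diff > difmax) {
            difmax = diff;     // each thread updates its own private copy of difmax
        }
        //** copy new to old temperatures**
        t[i][j] = tnew[i][j];
    }
} // per-thread maxima are combined into the shared difmax at the end of the loop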

Now, parallelization has an overhead cost, and speedup can only be expected for large values of m and n; the problem you are considering may simply be too small. One way to reduce the overhead is to merge the two parallel constructs, so that the pool of threads does not have to be spawned twice. Even better, place the while loop inside the parallel construct, so that at each iteration we only have to synchronize the existing threads instead of creating and destroying them:

difmax=1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {

    // have one thread reset difmax and increment iter
    #pragma omp single nowait
    iter++,difmax=0.0;

    // loop to update tnew - distributed among threads
    #pragma omp for schedule(static) // worksharing only: we are already inside the parallel region
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
        }
    } //implicit barrier here

    // each thread resets its private difmax
    priv_difmax=0.0;

    // loop to compute difmax - distributed among threads
    #pragma omp for schedule(static) nowait
    for (i=1; i <= m; i++) {
        for (j=1; j <= n; j++) {
            diff = fabs(tnew[i][j]-t[i][j]);

            if (diff > priv_difmax) {
                priv_difmax = diff;
            }

            //** copy new to old temperatures**
            t[i][j] = tnew[i][j];
        }
    }

    // each thread now updates difmax if needed, one at a time
    #pragma omp critical 
    if (priv_difmax > difmax){
        difmax = priv_difmax;
    }

    // put a barrier here to make sure that diffmax have been updated 
    // before any thread tests the condition for next iteration of the
    // while-loop condition
    #pragma omp barrier
}

The best way to compare how the code runs in serial and in parallel is to compile it with and without OpenMP support (e.g. with gcc, by adding or omitting the -fopenmp compiler and linker flag). This will help point out whether the issue really lies in the parallelization, or in other modifications made between the original serial code and its "parallel-ready" version.

The idea is to see where the time is lost when going from the original serial code, to the parallel code compiled without OpenMP support, to the parallel code compiled with OpenMP.
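
For example, assuming the source file is saved as heat.c (a name used here purely for illustration), the two gcc builds would look like:

gcc -O2 -o heat_serial heat.c -lm              # OpenMP pragmas are ignored: serial reference
gcc -O2 -fopenmp -o heat_parallel heat.c -lm   # OpenMP enabled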

Some preprocessing is needed, because without OpenMP support the compiler will not recognize functions such as omp_get_thread_num(). omp_get_wtime() should not be used either; since all of your timing is done outside the parallel regions, there is no need for that particular function, and a call to time() will be accurate enough (it requires #include <time.h>).

// This part is necessary for the code to compile and run whether or not it is built with OpenMP
#ifdef _OPENMP
   #include <omp.h>
#else
   # ifndef _ESCAPE_OMPENMP
      #define omp_get_num_threads() 1
      #define omp_get_thread_num() 0
      #define omp_get_max_threads() 1
      #define _ESCAPE_OMPENMP
   #endif
#endif
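
Along the same lines, a minimal sketch (not from the original answer) of a timing fallback that keeps omp_get_wtime() when OpenMP is available and falls back on time() otherwise. The question's code also calls omp_set_num_threads(), which needs a similar no-op stub in the serial build:

#ifdef _OPENMP
   #include <omp.h>
   #define get_wall_time() omp_get_wtime()
#else
   #include <time.h>
   // time() only has one-second resolution, which is enough for timing the whole run
   #define get_wall_time() ((double) time(NULL))
   #define omp_set_num_threads(n)   // no-op in the serial build
#endif

// usage: tstart = get_wall_time(); ... tstop = get_wall_time();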