Question

我正在尝试优化一个循环：它迭代100万次，将两个数组的元素分别乘以pow的结果后相加。我的系统有48个核心。我使用malloc分配数组，并在主进程创建的pthread中使用OpenMP的pragma。不幸的是，代码的并行版本比串行版本（在同一系统上）花费的时间多出近20倍。我使用gettimeofday来测量执行时间。我的gcc版本是4.3.4。请帮助我理解并解决这个问题。

我的代码：

/* Number of elements in each of the three arrays. */
#define N 1000000
/* Chunk size passed to schedule(dynamic, ...) below; 20833 is roughly N / 48. */
#define CHUNKSIZE 20833
:
:
/* a and b are the input arrays, c receives the result. */
double                          *a, *b, *c;
/* Start/end timestamps. NOTE(review): these are single objects and are not
   listed in the private clause of the parallel region below. */
struct timeval                  st, et;
/* One elapsed-time slot per thread id (48 slots) and the sum printed at the end. */
long double                     time_used[48], tot_time;
:
:
a = malloc(sizeof(double) * N);
b = malloc(sizeof(double) * N);
c = malloc(sizeof(double) * N);
/* Fill both inputs with the element index as a double. */
for (i=0; i<N; i++)
     a[i] = b[i] = i * 1.0;
chunk = CHUNKSIZE;
:
:
/* Each thread times its own share of the loop and stores the result in
   time_used[tid]. */
#pragma omp parallel shared(a,b,c,chunk,time_used) private(i)
    {
        int tid = omp_get_thread_num();
        /* NOTE(review): st appears to be shared between threads, so every
           thread writes the same object here and the value read on the next
           line may have been stored by another thread. */
        gettimeofday(&st, NULL);
        /* Convert the timestamp to microseconds. */
        long double st_in_micro = (st.tv_sec)*1000000 + (st.tv_usec); 
        /* nowait: a thread proceeds to take its end timestamp without waiting
           for the other threads to finish their chunks. */
        #pragma omp for schedule (dynamic,chunk) nowait
        for (i=0; i<N; i++)
            c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);

        /* NOTE(review): same concern as st above; et is written by every thread. */
        gettimeofday(&et, NULL);
        long double et_in_micro = (et.tv_sec)*1000000 + (et.tv_usec);
        time_used[tid] = et_in_micro - st_in_micro;
        printf ("time taken by thread %d = %Lf\n", tid, time_used[tid]);
    }

    /* Add up the per-thread elapsed times, skipping negative entries. This is
       a sum over threads, not the wall-clock duration of the parallel region.
       NOTE(review): assumes at most 48 threads ran, matching the size of
       time_used. */
    tot_time = 0;
    for (i=0; i<48; i++)
    {
        if (time_used[i] < 0)
            continue;
        tot_time += time_used[i];
    }
    printf("Total time taken by all the threads = %Lf\n", tot_time);

并行版本的输出：

time taken by thread 20 = 936.000000
time taken by thread 35 = 1826.000000
time taken by thread 17 = 2.000000
time taken by thread 38 = 603.000000
time taken by thread 22 = 2009.000000
time taken by thread 43 = 0.000000
time taken by thread 13 = 1703.000000
time taken by thread 14 = 1750.000000
time taken by thread 31 = 2128.000000
time taken by thread 1 = 2298.000000
time taken by thread 47 = 602.000000
time taken by thread 34 = 1749.000000
time taken by thread 7 = 1642.000000
time taken by thread 15 = 2542.000000
time taken by thread 9 = 2628.000000
time taken by thread 42 = 3294.000000
time taken by thread 12 = 3446.000000
time taken by thread 30 = 2290.000000
time taken by thread 23 = 3711.000000
time taken by thread 5 = 0.000000
time taken by thread 4 = 2457.000000
time taken by thread 16 = 2573.000000
time taken by thread 6 = 2715.000000
time taken by thread 41 = 2456.000000
time taken by thread 2 = 2877.000000
time taken by thread 0 = 2721.000000
time taken by thread 26 = 4209.000000
time taken by thread 37 = 2796.000000
time taken by thread 24 = 2846.000000
time taken by thread 46 = 2999.000000
time taken by thread 39 = 2569.000000
time taken by thread 45 = 2128.000000
time taken by thread 29 = 2855.000000
time taken by thread 44 = 3075.000000
time taken by thread 36 = 1.000000
time taken by thread 32 = 3035.000000
time taken by thread 3 = 1544.000000
time taken by thread 27 = 3132.000000
time taken by thread 25 = 3076.000000
time taken by thread 33 = 1.000000
time taken by thread 28 = 3042.000000
time taken by thread 21 = 3237.000000
time taken by thread 19 = 1594.000000
time taken by thread 18 = 2202.000000
time taken by thread 10 = 1655.000000
time taken by thread 8 = 3931.000000
time taken by thread 40 = 2726.000000
time taken by thread 11 = 2060.000000
Total time taken by all the threads = 105671.000000

串行版的输出：

Total time taken by all the threads = 5574.000000

请帮助我了解此代码的错误。

Answer 1

您正在将每个线程使用的时间添加到tot_time，并将其与仅使用一个线程的时间进行比较。

在大多数情况下（超线性加速的情形除外），所有线程的时间之和tot_time将大于或等于仅使用一个线程时的时间。理想情况是两者相等，这意味着工作时间在所有线程之间均匀分布。

因此，您对tot_time的定义是衡量负载分配情况的一个有趣指标，但我不认为这是您正在寻找的。

相反，您可以报告耗时最长的线程的时间。但如果只报告并行区域内部所用的时间，就会漏掉OpenMP实现工作共享所带来的开销。因此，我会改为报告整个工作共享区域所用的时间，如下面的代码所示。

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>

/*
 * Benchmark kernel: computes c[i] = a[i]*pow(2,2) + b[i]*pow(3,2) for every
 * i in [0, N) inside an OpenMP parallel region.
 *
 * Prints the time each thread spent in the region (one line per thread,
 * serialized by a critical section) and then the wall-clock time of the
 * whole region as measured from outside it.
 *
 * a, b: input arrays of at least N doubles.
 * c:    output array of at least N doubles.
 * N:    number of elements to process.
 */
void foo(double * restrict a, double * restrict b, double * restrict c, int N) {
    const double region_start = omp_get_wtime();

    #pragma omp parallel
    {
        const double thread_start = omp_get_wtime();

        /* nowait: each thread stops its own timer as soon as its share is done. */
        #pragma omp for nowait
        for (int idx = 0; idx < N; ++idx) {
            c[idx] = a[idx]*pow(2,2) + b[idx]*pow(3,2);
        }

        const double thread_elapsed = omp_get_wtime() - thread_start;

        /* One thread at a time prints its line. */
        #pragma omp critical
        {
            printf ("time taken by thread %d = %.2f seconds\n", omp_get_thread_num(), thread_elapsed);
        }
    }

    /* Measured outside the region, so it includes the region's own overhead. */
    const double region_elapsed = omp_get_wtime() - region_start;
    printf("Total time taken by all the threads = %.2f seconds\n", region_elapsed);
}

/*
 * Benchmark driver: allocates three arrays of 2^28 doubles, writes a
 * non-zero byte pattern into each one so that the memory has been written
 * before the timed region, runs foo() on them and releases the buffers.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    int N = 1<<28;
    /* Byte count computed once in size_t; all three arrays have this size. */
    size_t bytes = sizeof(double) * (size_t)N;
    double *a = malloc(bytes);
    double *b = malloc(bytes);
    double *c = malloc(bytes);

    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate 3 arrays of %d doubles\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* Non-zero fill on purpose: the values themselves are irrelevant to the
       timing, the point is that every byte gets written. */
    memset(a,1,bytes);
    memset(b,1,bytes);
    memset(c,1,bytes);

    foo(a,b,c,N);

    free(a);
    free(b);
    free(c);
    return 0;
}

您的代码也有一些我修复的竞争条件。

最后，内存刚分配时，（通常）并不会真正分配所有页面，页面要到首次被写入时才会实际分配。有趣的是，calloc也不会立即分配页面，而只是指向同一个零页面。更复杂的是，GCC会把malloc后跟memset(0)的组合转换为calloc。因此，为了让页面真正被分配，您需要向数组写入非零值（例如 memset(a,1,sizeof *a * N)）。

这是我的4核/ 8硬件线程系统上的计时结果。

time taken by thread 1 = 0.33 seconds
time taken by thread 5 = 0.33 seconds
time taken by thread 7 = 0.33 seconds
time taken by thread 6 = 0.34 seconds
time taken by thread 3 = 0.34 seconds
time taken by thread 4 = 0.34 seconds
time taken by thread 0 = 0.34 seconds
time taken by thread 2 = 0.33 seconds
Total time taken by all the threads = 0.36 seconds

然后export OMP_NUM_THREADS=2

time taken by thread 0 = 0.31 seconds
time taken by thread 1 = 0.33 seconds
Total time taken by all the threads = 0.33 seconds

然后export OMP_NUM_THREADS=1

time taken by thread 0 = 0.53 seconds
Total time taken by all the threads = 0.53 seconds

你的操作受内存带宽限制，因此在我的双通道DDR4系统上，线程数超过两个之后就看不到多少收益了。

在NUMA系统上，内存的位置会产生很大影响。由于页面只在第一次被触碰（写入）时才分配，因此在做基准测试时，先并行地写入数组可能更合理。以下代码就是这样做的，但它只适用于静态调度。

确保禁用动态调整线程数（不要与动态调度混淆），例如： export OMP_DYNAMIC=false。

#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>

/*
 * Benchmark kernel with parallel initialization.
 *
 * First writes 1 into every element of a, b and c from a statically
 * scheduled parallel loop, then times the statically scheduled computation
 * c[i] = a[i]*pow(2,2) + b[i]*pow(3,2) over the same index range. Prints
 * the time each thread spent in the timed region and then the wall-clock
 * time of the whole region.
 *
 * a, b, c: arrays of at least N doubles; their previous contents are
 *          overwritten by the initialization loop.
 * N:       number of elements to process.
 */
void foo(double * restrict a, double * restrict b, double * restrict c, int N) {

    /* Initialize in parallel with the same static schedule as the timed
       loop below, so each thread writes the index range it later computes on. */
    #pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) c[i] = b[i] = a[i] = 1;

    double tot_time = -omp_get_wtime();
    #pragma omp parallel
    {
        double dtime = -omp_get_wtime();
        /* Worksharing loop inside the enclosing parallel region. This has to
           be `omp for`: schedule and nowait are not clauses of `omp parallel`,
           and a nested `parallel` would not split the iterations among the
           threads of this team. */
        #pragma omp for schedule(static) nowait
        for (int i=0; i<N; i++) c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);
        dtime += omp_get_wtime();
        /* One thread at a time prints its line. */
        #pragma omp critical
        printf ("time taken by thread %d = %.2f seconds\n", omp_get_thread_num(), dtime);
    }
    /* Measured outside the region, so it includes the region's own overhead. */
    tot_time += omp_get_wtime();
    printf("Total time taken by all the threads = %.2f seconds\n", tot_time);
}

/*
 * Benchmark driver: allocates three arrays of 2^28 doubles, hands them to
 * foo() and releases them afterwards.
 *
 * The arrays are deliberately not written here; foo() performs the first
 * write to them itself.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    int N = 1<<28;
    /* Byte count computed once in size_t; all three arrays have this size. */
    size_t bytes = sizeof(double) * (size_t)N;
    double *a = malloc(bytes);
    double *b = malloc(bytes);
    double *c = malloc(bytes);

    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate 3 arrays of %d doubles\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    foo(a,b,c,N);

    free(a);
    free(b);
    free(c);
    return 0;
}

使用OpenMP优化循环后没有性能提升

1 个答案: