我正在尝试优化一个循环:它迭代 100 万次,使用 pow 对两个数组逐元素相加。我的系统有 48 个核心。数组通过 malloc 分配,OpenMP 的 pragma 用在主进程创建的一个 pthread 线程中。不幸的是,并行版本的代码(在同一系统上)耗时几乎是串行版本的 20 倍。我使用 gettimeofday 来测量执行时间。我的 gcc 版本是 4.3.4。请帮助我理解并解决这个问题。
我的代码:
#define N 1000000
#define CHUNKSIZE 20833
:
:
double *a, *b, *c;
struct timeval st, et;
long double time_used[48], tot_time;
:
:
a = malloc(sizeof(double) * N);
b = malloc(sizeof(double) * N);
c = malloc(sizeof(double) * N);
for (i=0; i<N; i++)
a[i] = b[i] = i * 1.0;
chunk = CHUNKSIZE;
:
:
/*
 * Parallel region: each thread takes a start timestamp, processes chunks of
 * the loop handed out by the dynamic schedule, takes an end timestamp, and
 * stores its own elapsed microseconds in time_used[tid].
 *
 * NOTE(review): as shown, st and et are declared before the region and are
 * not listed in private(), so every thread writes the same two structs
 * concurrently. A thread's elapsed time may therefore be computed from
 * timestamps written by another thread.
 */
#pragma omp parallel shared(a,b,c,chunk,time_used) private(i)
{
int tid = omp_get_thread_num();
gettimeofday(&st, NULL);
/* The multiplication happens in the integer type of tv_sec before the
 * result is converted to long double. */
long double st_in_micro = (st.tv_sec)*1000000 + (st.tv_usec);
/* nowait drops the barrier at the end of the loop, so a thread reaches the
 * end timestamp as soon as it has no more chunks to take. */
#pragma omp for schedule (dynamic,chunk) nowait
for (i=0; i<N; i++)
c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);
gettimeofday(&et, NULL);
long double et_in_micro = (et.tv_sec)*1000000 + (et.tv_usec);
time_used[tid] = et_in_micro - st_in_micro;
printf ("time taken by thread %d = %Lf\n", tid, time_used[tid]);
}
/*
 * Add up the per-thread elapsed times stored in time_used[].
 *
 * NOTE(review): the result is the sum of the time spent by every thread, not
 * the wall-clock duration of the parallel region, so it is not directly
 * comparable with the elapsed time of a single-threaded run.
 */
tot_time = 0;
/* 48 is the hard-coded number of slots inspected in time_used[]. */
for (i=0; i<48; i++)
{
/* Negative entries are left out of the sum. */
if (time_used[i] < 0)
continue;
tot_time += time_used[i];
}
printf("Total time taken by all the threads = %Lf\n", tot_time);
并行版本的输出:
time taken by thread 20 = 936.000000
time taken by thread 35 = 1826.000000
time taken by thread 17 = 2.000000
time taken by thread 38 = 603.000000
time taken by thread 22 = 2009.000000
time taken by thread 43 = 0.000000
time taken by thread 13 = 1703.000000
time taken by thread 14 = 1750.000000
time taken by thread 31 = 2128.000000
time taken by thread 1 = 2298.000000
time taken by thread 47 = 602.000000
time taken by thread 34 = 1749.000000
time taken by thread 7 = 1642.000000
time taken by thread 15 = 2542.000000
time taken by thread 9 = 2628.000000
time taken by thread 42 = 3294.000000
time taken by thread 12 = 3446.000000
time taken by thread 30 = 2290.000000
time taken by thread 23 = 3711.000000
time taken by thread 5 = 0.000000
time taken by thread 4 = 2457.000000
time taken by thread 16 = 2573.000000
time taken by thread 6 = 2715.000000
time taken by thread 41 = 2456.000000
time taken by thread 2 = 2877.000000
time taken by thread 0 = 2721.000000
time taken by thread 26 = 4209.000000
time taken by thread 37 = 2796.000000
time taken by thread 24 = 2846.000000
time taken by thread 46 = 2999.000000
time taken by thread 39 = 2569.000000
time taken by thread 45 = 2128.000000
time taken by thread 29 = 2855.000000
time taken by thread 44 = 3075.000000
time taken by thread 36 = 1.000000
time taken by thread 32 = 3035.000000
time taken by thread 3 = 1544.000000
time taken by thread 27 = 3132.000000
time taken by thread 25 = 3076.000000
time taken by thread 33 = 1.000000
time taken by thread 28 = 3042.000000
time taken by thread 21 = 3237.000000
time taken by thread 19 = 1594.000000
time taken by thread 18 = 2202.000000
time taken by thread 10 = 1655.000000
time taken by thread 8 = 3931.000000
time taken by thread 40 = 2726.000000
time taken by thread 11 = 2060.000000
Total time taken by all the threads = 105671.000000
串行版的输出:
Total time taken by all the threads = 5574.000000
请帮助我了解此代码的错误。
答案 0 :(得分:1)
您把每个线程各自耗费的时间累加到 tot_time 中,再拿它与只使用一个线程时的耗时作比较。
在大多数情况下(超线性加速的情形除外),对所有线程这样求和得到的 tot_time 都会大于或等于只使用一个线程时的耗时。理想情况是二者相等,这意味着时间被均匀地分摊到了所有线程上。
因此,您所定义的 tot_time 是衡量负载分配情况的一个有趣指标,但我认为这并不是您真正想要的。
您可以改为报告耗时最长的那个线程的时间。但只报告并行区域内部的耗时,会忽略 OpenMP 实现工作共享所带来的开销。因此,我会报告整个工作共享区域的耗时,如下面的代码所示。
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
/*
 * Compute c[i] = a[i]*pow(2,2) + b[i]*pow(3,2) for every i in [0, N) using an
 * OpenMP work-sharing loop.
 *
 * Prints the time each thread spent inside the parallel region, then the
 * wall-clock time of the whole region, so the cost of setting up the
 * work-sharing is included in the final figure.
 *
 * a, b  input arrays, N elements each
 * c     output array, N elements
 * N     number of elements to process
 */
void foo(double * restrict a, double * restrict b, double * restrict c, int N) {
    const double region_start = omp_get_wtime();

    #pragma omp parallel
    {
        const double thread_start = omp_get_wtime();

        /* nowait: a thread stops its own clock as soon as its share of the
         * iterations is done instead of waiting at the loop barrier. */
        #pragma omp for nowait
        for (int idx = 0; idx < N; ++idx) {
            c[idx] = a[idx]*pow(2,2) + b[idx]*pow(3,2);
        }

        const double thread_elapsed = omp_get_wtime() - thread_start;

        /* Serialise the output so lines from different threads do not interleave. */
        #pragma omp critical
        {
            printf ("time taken by thread %d = %.2f seconds\n", omp_get_thread_num(), thread_elapsed);
        }
    }

    const double region_elapsed = omp_get_wtime() - region_start;
    printf("Total time taken by all the threads = %.2f seconds\n", region_elapsed);
}
/*
 * Benchmark driver: allocates three arrays of 2^28 doubles (2 GiB each),
 * touches every page, runs foo() once and releases the memory.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    int N = 1<<28;
    double *a = malloc(sizeof *a * N);
    double *b = malloc(sizeof *b * N);
    double *c = malloc(sizeof *c * N);

    /* About 6 GiB is requested in total, so a failed allocation is a
     * realistic outcome and must not be passed on to memset()/foo(). */
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate three arrays of %d doubles\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    /* Write a non-zero byte pattern so the pages are really allocated before
     * the timed region; a zero fill could be turned into calloc by the
     * compiler and leave the pages untouched. */
    memset(a,1,sizeof *a * N);
    memset(b,1,sizeof *b * N);
    memset(c,1,sizeof *c * N);

    foo(a,b,c,N);

    free(a);
    free(b);
    free(c);
    return 0;
}
您的代码中还存在一些竞争条件,我在上面的代码中已将其修复。
最后,内存在刚分配时(通常)并不会真正分配所有页面,页面要到首次被写入时才会分配。有趣的是,calloc 并不分配页面,而只是指向同一个全零页面。更复杂的是,GCC 会把 malloc 后面紧跟 memset(0) 的写法转换为 calloc。因此,为了真正分配页面,您需要向数组写入非零值(例如 memset(a,1,sizeof *a * N))。
这是我的4核/ 8硬件线程系统上的计时结果。
time taken by thread 1 = 0.33 seconds
time taken by thread 5 = 0.33 seconds
time taken by thread 7 = 0.33 seconds
time taken by thread 6 = 0.34 seconds
time taken by thread 3 = 0.34 seconds
time taken by thread 4 = 0.34 seconds
time taken by thread 0 = 0.34 seconds
time taken by thread 2 = 0.33 seconds
Total time taken by all the threads = 0.36 seconds
然后export OMP_NUM_THREADS=2
time taken by thread 0 = 0.31 seconds
time taken by thread 1 = 0.33 seconds
Total time taken by all the threads = 0.33 seconds
然后export OMP_NUM_THREADS=1
time taken by thread 0 = 0.53 seconds
Total time taken by all the threads = 0.53 seconds
您的运算受内存带宽限制,因此在我的双通道 DDR4 系统上,线程数超过两个之后就看不到多少收益了。
在 NUMA 系统上,内存所处的位置影响很大。由于页面只在首次被访问(first touch)时才分配,所以做基准测试时,先以并行方式写一遍数组可能更合理。下面的代码就是这样做的,但它只适用于静态调度。
请确保禁用线程数的动态调整(不要与动态调度混淆),例如:export OMP_DYNAMIC=false。
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <stdio.h>
/*
 * Initialise a, b and c in parallel, then compute
 * c[i] = a[i]*pow(2,2) + b[i]*pow(3,2) for every i in [0, N) and report timings.
 *
 * Both loops use schedule(static) so that, with a fixed thread count, each
 * thread is handed the same index range in the initialisation loop and in the
 * timed loop. The intent is that a thread first writes the pages it will
 * later compute on.
 *
 * Prints the time each thread spent in the timed region, then the wall-clock
 * time of the whole region.
 *
 * a, b  arrays of N elements; overwritten with 1 before the computation
 * c     output array, N elements
 * N     number of elements to process
 */
void foo(double * restrict a, double * restrict b, double * restrict c, int N) {
    /* Parallel first write of all three arrays (not timed). */
    #pragma omp parallel for schedule(static)
    for (int i=0; i<N; i++) c[i] = b[i] = a[i] = 1;

    double tot_time = -omp_get_wtime();
    #pragma omp parallel
    {
        double dtime = -omp_get_wtime();
        /* Work-sharing loop inside the enclosing parallel region. This must be
         * "omp for": schedule and nowait are not valid on "omp parallel", and
         * a nested parallel construct would not split the iterations. */
        #pragma omp for schedule(static) nowait
        for (int i=0; i<N; i++) c[i] = a[i]*pow(2,2) + b[i]*pow(3,2);
        dtime += omp_get_wtime();
        /* Serialise the output so lines from different threads do not interleave. */
        #pragma omp critical
        printf ("time taken by thread %d = %.2f seconds\n", omp_get_thread_num(), dtime);
    }
    tot_time += omp_get_wtime();
    printf("Total time taken by all the threads = %.2f seconds\n", tot_time);
}
/*
 * Benchmark driver: allocates three arrays of 2^28 doubles (2 GiB each), runs
 * foo() once and releases the memory.
 *
 * The arrays are deliberately not written here: foo() performs the first
 * write itself, in parallel, so no serial memset() is done beforehand.
 *
 * Returns 0 on success, EXIT_FAILURE if any allocation fails.
 */
int main(void) {
    int N = 1<<28;
    double *a = malloc(sizeof *a * N);
    double *b = malloc(sizeof *b * N);
    double *c = malloc(sizeof *c * N);

    /* About 6 GiB is requested in total, so a failed allocation is a
     * realistic outcome and must not be passed on to foo(). */
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "failed to allocate three arrays of %d doubles\n", N);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    foo(a,b,c,N);

    free(a);
    free(b);
    free(c);
    return 0;
}