我正在尝试并行化点积运算,并使用OpenMP测量它在不同核心数下的运行时间。当N = 1e9时,我测得的CPU时间是:1个核心5.6秒,8个核心6.0秒,16个核心10.8秒。为什么使用的核心越多,计算时间反而越长?
这是我的代码:
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <omp.h>
/* Scalar type used for the vector elements and the dot-product accumulator. */
#define DATA_TYPE float
/* Number of elements in each vector; the double constant 1e9 is converted to int. */
const int N = 1e9;
/*
 * Fills two N-element vectors, computes their dot product with an OpenMP
 * parallel reduction, and prints the time reported by clock() together with
 * the result.
 *
 * Returns 0 on success, or EXIT_FAILURE if either vector cannot be allocated.
 */
int main(void)
{
    /* Each vector needs N * sizeof(DATA_TYPE) bytes, so allocation can fail. */
    DATA_TYPE *y = malloc((size_t)N * sizeof *y);
    DATA_TYPE *z = malloc((size_t)N * sizeof *z);
    if (y == NULL || z == NULL) {
        fprintf(stderr, "failed to allocate two vectors of %d elements\n", N);
        free(y);
        free(z);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++) {
        y[i] = i * 1.0;
        z[i] = i * 2.0;
    }

    /* Thread count for the parallel loop; omp_get_max_threads() would
     * request as many threads as the runtime offers by default. */
    int nthreads = 1;
    printf("n threads = %d\n", nthreads);

    DATA_TYPE x_par = 0;

    /*
     * clock() reports processor time consumed by the process, not elapsed
     * time.  On typical POSIX systems that is the CPU time of all threads
     * added together, so this figure does not shrink when more threads are
     * used; omp_get_wtime() would give elapsed (wall-clock) time instead.
     */
    clock_t start = clock();
    omp_set_num_threads(nthreads);
#pragma omp parallel for reduction(+:x_par)
    for (int i = 0; i < N; i++) {
        x_par += y[i] * z[i];
    }
    clock_t end = clock();

    /* Kept in double: storing the quotient in DATA_TYPE would truncate it. */
    double cput_par = (double)(end - start) / (double)CLOCKS_PER_SEC;
    printf("Parallel time use: %f\n", cput_par);
    printf("x_par = %f\n", x_par);

    free(y);
    free(z);
    return 0;
}
答案 0 :(得分:1)
问题在于:clock() 测得的是所有核心/线程的CPU时间总和。要得到每个线程的平均CPU时间,需要把该值除以线程数。另一种解决办法是测量墙上时间(wall time,即操作开始前与结束后实际时刻之差)。需要注意的是,如果使用墙上时间,操作系统在此期间调度运行的其他程序所占用的时间也会被计入其中。为了说明这一点,并与严格串行的情况进行比较,我给出下面这段代码:
#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h> //gettimeofday()
#include <time.h>
#include <omp.h>
/* Scalar type used for the vector elements and the dot-product accumulators. */
#define DATA_TYPE float
/* Number of elements in each vector; the double constant 1e9 is converted to int. */
const int N = 1e9;
/* Returns the current wall-clock time in seconds, as read by gettimeofday(). */
static double wall_seconds(void)
{
    struct timeval now;

    gettimeofday(&now, NULL);
    return (double)now.tv_sec + (double)now.tv_usec * .000001;
}

/* Returns the processor time reported by clock(), converted to seconds. */
static double cpu_seconds(void)
{
    return (double)clock() / CLOCKS_PER_SEC;
}

/*
 * Computes the dot product of two N-element vectors twice -- once with a
 * plain sequential loop and once with an OpenMP parallel reduction -- and
 * prints CPU time, wall-clock time and the result for each run, followed by
 * the speedup of the parallel run under both time measures.
 *
 * Returns 0 on success, or EXIT_FAILURE if either vector cannot be allocated.
 */
int main(void)
{
    const int nthreads = 8;

    printf("- - -DOT PROCUCT: OPENMP - - -\n");
    printf("Vector size : %d\n", N);
    printf("Number of threads used: %d\n", nthreads);

    /* INITIALIZATION */
    /* Each vector needs N * sizeof(DATA_TYPE) bytes, so allocation can fail. */
    DATA_TYPE *y = malloc((size_t)N * sizeof *y);
    DATA_TYPE *z = malloc((size_t)N * sizeof *z);
    if (y == NULL || z == NULL) {
        fprintf(stderr, "failed to allocate two vectors of %d elements\n", N);
        free(y);
        free(z);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < N; i++) {
        y[i] = i * 1.0;
        z[i] = i * 2.0;
    }

    /*
     * Both sums are accumulated in DATA_TYPE.  With float, once the running
     * sum is much larger than the individual products, rounding discards part
     * of every addition, so the two results can differ from each other and
     * from the exact value.  The accumulator type is left as it was so the
     * printed results are unchanged; a double accumulator would be accurate.
     */
    DATA_TYPE x_seq = 0;
    DATA_TYPE x_par = 0;

    /* SEQUENTIAL CASE: the wall-clock samples bracket the CPU-time samples. */
    double tstart_wall = wall_seconds();
    double tstart_cpu = cpu_seconds();
    for (int i = 0; i < N; i++) {
        x_seq += y[i] * z[i];
    }
    double tend_cpu = cpu_seconds();
    double tend_wall = wall_seconds();

    double cputime_seq = tend_cpu - tstart_cpu;
    double walltime_seq = tend_wall - tstart_wall;
    printf("Sequential CPU time: %f\n", cputime_seq);
    printf("Sequential Walltime: %f\n", walltime_seq);
    printf("Sequential result : %f\n", x_seq);

    /* PARALLEL CASE */
    tstart_wall = wall_seconds();
    tstart_cpu = cpu_seconds();
    omp_set_num_threads(nthreads);
#pragma omp parallel for reduction(+:x_par)
    for (int i = 0; i < N; i++) {
        x_par += y[i] * z[i];
    }
    tend_cpu = cpu_seconds();
    tend_wall = wall_seconds();

    double cputime_par = tend_cpu - tstart_cpu;
    double walltime_par = tend_wall - tstart_wall;
    /*
     * clock() covers the whole process, so divide by the thread count to get
     * an average per thread.  This assumes the runtime really ran the loop
     * with nthreads threads -- TODO confirm with omp_get_num_threads().
     */
    cputime_par /= nthreads;
    printf("Parallel CPU time : %f\n", cputime_par);
    printf("Parallel Walltime : %f\n", walltime_par);
    printf("Parallel result : %f\n", x_par);

    /* SPEEDUP */
    printf("Speedup (cputime) : %f\n", cputime_seq / cputime_par);
    printf("Speedup (walltime) : %f\n", walltime_seq / walltime_par);

    free(y);
    free(z);
    return 0;
}
一次典型运行的输出如下:
- - -DOT PROCUCT: OPENMP - - -
Vector size : 1000000000
Number of threads used: 8
Sequential CPU time: 4.871956
Sequential Walltime: 4.878946
Sequential result : 38685626227668133590597632.000000
Parallel CPU time : 0.751475
Parallel Walltime : 0.757933
Parallel result : 133586303067416523805032448.000000
Speedup (cputime) : 6.483191
Speedup (walltime) : 6.437172
如您所见,得到的点积并不正确:串行结果与并行结果互不相同,原因是用单精度float累加如此大的和会丢失精度(改用double累加即可改善)。不过这并不影响计时,因此已经回答了最初的问题。