我知道已经有一个相关帖子:openMP performance
但这里我的例子非常简单
C代码:
/*
 * Fills a buffer of szGlobalWorkSize * 100 floats in parallel and prints
 * the elapsed wall-clock time.
 *
 * Fixes vs. the original:
 *  - returns a value (the original was a non-void function that fell off
 *    the end: undefined behavior if the result is used);
 *  - checks the calloc result before writing through it;
 *  - frees the buffer (the original leaked it);
 *  - iterates with size_t instead of casting to int, avoiding overflow
 *    for large work sizes;
 *  - uses C11 timespec_get instead of POSIX gettimeofday for portability.
 *
 * Returns 0 on success, -1 if the allocation fails.
 */
int MaFunc(size_t szGlobalWorkSize)
{
    const size_t n = szGlobalWorkSize * 100;
    float *pfResult = calloc(n, sizeof *pfResult);
    if (pfResult == NULL) {
        fprintf(stderr, "allocation of %zu floats failed\n", n);
        return -1;
    }
    const float fValue = 0.5f;

    struct timespec t0, t1;
    timespec_get(&t0, TIME_UTC);

    /* Loop variable declared in the for statement: OpenMP privatizes it
     * per thread automatically. */
#pragma omp parallel for
    for (size_t iGID = 0; iGID < n; iGID++) {
        pfResult[iGID] = fValue;
    }

    timespec_get(&t1, TIME_UTC);
    double elapsed = (double)(t1.tv_sec - t0.tv_sec)
                   + (double)(t1.tv_nsec - t0.tv_nsec) / 1e9;
    printf("%.6lf Time OMP\n", elapsed);

    free(pfResult);
    return 0;
}
当我使用 OpenMP 时,这个示例反而变慢了:不使用 OpenMP 时是 0.015 秒,使用 OpenMP 时是 0.045 秒(szGlobalWorkSize = 131072)。
我使用的 gcc 命令行是:gcc -march=native -fopenmp -O3 MyCode.c -lm
gcc(GCC)4.8.2 20140120(Red Hat 4.8.2-15)
EDIT1:
/*
 * Page-chunked variant of the fill benchmark: allocates a page-aligned
 * buffer with valloc and has each OpenMP iteration fill one whole page,
 * so no two threads ever touch the same page.
 *
 * Relies on a file-scope szGlobalWorkSize defined elsewhere in the file.
 *
 * Fixes vs. the original:
 *  - the inner loop counter j was declared at function scope, making it
 *    SHARED between OpenMP threads — a data race that corrupts the loop;
 *    both counters are now loop-local (privatized per thread);
 *  - checks the valloc result;
 *  - frees the buffer (the original leaked it);
 *  - returns a value (non-void function previously fell off the end: UB).
 *
 * Returns 0 on success, -1 if the allocation fails.
 */
int MyFunc2()
{
    /* NOTE(review): valloc is obsolete (posix_memalign/aligned_alloc are
     * the modern replacements), but page alignment is the point of this
     * experiment, so it is kept. */
    float *pfResult = (float *)valloc(szGlobalWorkSize * 100 * sizeof(float));
    if (pfResult == NULL)
        return -1;
    float fValue = 0.5f;

    struct timeval tim;
    gettimeofday(&tim, NULL);
    double tLaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    double time = omp_get_wtime();

    int iChunk = getpagesize();                         /* bytes per page */
    int iSize = ((int)szGlobalWorkSize * 100) / iChunk; /* whole pages    */

#pragma omp parallel for
    for (int iGID = 0; iGID < iSize; iGID++)
    {
        /* Each iteration writes exactly one page-sized chunk. */
        for (int j = 0; j < iChunk; j++)
            pfResult[iGID * iChunk + j] = fValue;
    }

    time = omp_get_wtime() - time;
    gettimeofday(&tim, NULL);
    double tLaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    printf("%.6lf Time OMP\n", tLaunch2 - tLaunch1);
    printf("Pagesize=%d\n", getpagesize());
    printf("%.6lf Time OMP2\n", time);

    free(pfResult);
    return 0;
}
使用 memalign 分配对齐内存块时,结果也相同。
按线程计时编辑2
/* EDIT 2: per-thread timing.  Each thread records its own start time in
 * pdTime[tid], runs its share of the work-sharing loop, then overwrites
 * the slot with its elapsed time.
 * NOTE(review): this fragment relies on pdTime, iSize, pfResult, fValue,
 * iGID and dLocalTime from the enclosing (not shown) scope. */
#pragma omp parallel private(dLocalTime)
{
pdTime[omp_get_thread_num()] = omp_get_wtime();
printf("Thread Begin %d Time %f\n", omp_get_thread_num(), pdTime[omp_get_thread_num()] );
/* Iterations are divided statically among the threads of the team;
 * the loop variable iGID is privatized automatically by "omp for". */
#pragma omp for
for (iGID = 0; iGID < iSize; iGID++)
{
// for (j = 0; j < iChunk; j++)
{
// pfResult[iGID * iChunk + j] = fValue;
pfResult[iGID] = fValue;
}
}
//dLocalTime = (omp_get_wtime() - dLocalTime);
/* Per-thread elapsed time: end time minus the start stored above. */
pdTime[omp_get_thread_num()] = (omp_get_wtime() - pdTime[omp_get_thread_num()]);
printf("Thread End %d Time %f\n", omp_get_thread_num(), pdTime[omp_get_thread_num()]);
// printf("End Element %d processed by thread %d \n",0,tid);
}
每个线程耗时 0.015 秒,总共 0.045 秒,因此 OpenMP 带来了约 0.03 秒的固定开销。奇怪的是,即使数据规模很大,这部分固定开销依然存在,而且每个线程(这里有 48 个线程)的工作量只是总量的一小部分。
由于
答案 0 :(得分:0)
好的,既然你坚持......:)
使用固定线程预热:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#include <unistd.h>
/*
 * Benchmark driver: optionally warms up the OpenMP thread team (so that
 * thread creation is not charged to the timed loop), then fills a
 * page-aligned buffer page-by-page in parallel and reports the elapsed
 * time both via gettimeofday and omp_get_wtime.
 *
 * Fixes vs. the original:
 *  - inner loop counter j was declared at function scope, making it
 *    SHARED between OpenMP threads — a data race; both counters are now
 *    loop-local (privatized per thread);
 *  - checks the valloc result before writing through it;
 *  - frees the buffer and returns an explicit status.
 */
int main()
{
    int szGlobalWorkSize = 131072;

    omp_set_dynamic(0); /* keep the thread count fixed between runs */

    /* Warm-up: build the thread team before the timed region so that
     * thread-creation cost does not pollute the measurement. */
#if WARMUP
#pragma omp parallel
    {
#pragma omp master
        {
            printf("%d threads\n", omp_get_num_threads());
        }
    }
#endif

    printf("Pagesize=%d\n", getpagesize());

    float *pfResult = (float *)valloc(szGlobalWorkSize * 100 * sizeof(float));
    if (pfResult == NULL) {
        fprintf(stderr, "allocation failed\n");
        return 1;
    }
    float fValue = 0.5f;

    struct timeval tim;
    gettimeofday(&tim, NULL);
    double tLaunch1 = tim.tv_sec + (tim.tv_usec / 1000000.0);
    double time = omp_get_wtime();

    int iChunk = getpagesize();                         /* bytes per page */
    int iSize = ((int)szGlobalWorkSize * 100) / iChunk; /* whole pages    */

#pragma omp parallel for
    for (int iGID = 0; iGID < iSize; iGID++)
    {
        /* One page-sized chunk per iteration: threads never share a page. */
        for (int j = 0; j < iChunk; j++)
            pfResult[iGID * iChunk + j] = fValue;
    }

    time = omp_get_wtime() - time;
    gettimeofday(&tim, NULL);
    double tLaunch2 = tim.tv_sec + (tim.tv_usec / 1000000.0);

    printf("%.6lf Time1\n", tLaunch2 - tLaunch1);
    printf("%.6lf Time2\n", time);

    free(pfResult);
    return 0;
}
我的机器上有以下数字:
$ g++ -O2 -fopenmp testomp.cpp && OMP_NUM_THREADS=1 ./a.out
Pagesize=4096
0.036493 Time1
0.036489 Time2
$ g++ -O2 -fopenmp testomp.cpp && ./a.out
Pagesize=4096
0.034721 Time1
0.034718 Time2
$ g++ -O2 -fopenmp testomp.cpp -DWARMUP && ./a.out
24 threads
Pagesize=4096
0.026966 Time1
0.026963 Time2
正如您所看到的,线程创建时间对数字的贡献很大。
为什么它还没有扩展?嗯,这是极其受内存限制的工作负载。实际上,它会填充两次页面:一旦操作系统在第一次触摸时清除它,程序就会通过该值填充它。
似乎系统中没有足够的内存带宽。我不认为伪共享(false sharing)在这里起重要作用,因为 `parallel for` 默认使用的静态调度不会在线程之间交错迭代,伪共享最多只会在块的边界处发生一次。