代码是并行的,但我不知道为什么它比我的串行要慢,当我将线程添加到7到10时,程序也会变慢。
我一直在试图找出问题所在,但对我而言却很困难
我将for循环设为并行,但似乎无法正常工作。运行代码时,我没有收到任何错误。
#include <stdio.h>
#include <math.h>
#include <omp.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
int m;
int n;
double tol;// = 0.0001;
double tstart, tstop;
int i, j, iter, nthreads;
m = atoi(argv[1]);
n = atoi(argv[2]);
tol = atof(argv[3]);
double t[m+2][n+2], tnew[m+1][n+1], diff, difmax,priv_difmax;
printf("%d %d %lf\n",m,n,tol);
printf("Enter the number of threads (max 10) ");
scanf("%d",&nthreads);
omp_set_num_threads(nthreads);
tstart = omp_get_wtime ();
//** initialise temperature array*
#pragma omp parallel for schedule(static)\
default(shared) private(i,j)
for (i=0; i <= m+1; i++) {
for (j=0; j <= n+1; j++) {
t[i][j] = 30.0;
}
}
//*** fix boundary conditions***
for (i=1; i <= m; i++) {
t[i][0] = 20.0;
t[i][n+1] = 100.0;
}
for (j=1; j <= n; j++) {
t[0][j] = 10.0;
t[m+1][j] = 140.0;
}
//** main loop**
iter = 0;
difmax = 1000000.0;
while (difmax > tol) {
iter++;
// **update temperature for next iteration**
#pragma omp parallel for schedule(static) \
default(shared) private(i,j)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
}
}
// **work out maximum difference between old and new temperatures**
difmax = 0.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
}
}
}
tstop = omp_get_wtime ();
// print results
printf("iter = %d difmax = %9.11lf", iter, difmax);
for (i=0; i <= m+1; i++) {
printf("\n");
for (j=0; j <= n+1; j++) {
printf("%3.5lf ", t[i][j]);
}
}
printf("\n");
tstop = omp_get_wtime ();
printf("time taken is %4.3lf\n", (tstop-tstart));
printf("\n");
}
答案 0 :(得分:0)
除了以下代码外,我看不到明显的问题:
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
#pragma omp for schedule(static)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
}
}
将priv_difmax
复制到difmax
的约简部分应移出循环,以使线程仅通过一次critical
部分,而不是在外部每次迭代时都通过循环。
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
{
priv_difmax = 0.0;
#pragma omp for schedule(static) nowait //no need to wait after the loop
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
}
// Finish the loop first, then update difmax
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
} //Implicit barrier
现在,并行化具有开销成本,并且仅对于大的m和n值,可以期望加速。您正在考虑的问题可能太小。减少开销的方法是合并两个parallel
构造,这样就不必产生两次线程池。甚至更好的是,将while循环放入parallel
构造中,这样我们只需要在每次迭代时同步现有线程,而不必创建和销毁它们:
difmax=1000000.0;
#pragma omp parallel default(shared) private(i, j, diff, priv_difmax)
while (difmax > tol) {
// have one thread reset difmax and increment iter
#pragma omp single nowait
iter++,difmax=0.0;
// loop to update tnew - distributed among threads
#pragma omp parallel for schedule(static) \
default(shared) private(i,j)
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
tnew[i][j] = (t[i-1][j]+t[i+1][j]+t[i][j-1]+t[i][j+1])/4.0;
}
} //implicit barrier here
// each thread resets its private difmax
priv_difmax=0.0;
// loop to compute difmax - distributed among threads
#pragma omp for schedule(static) nowait
for (i=1; i <= m; i++) {
for (j=1; j <= n; j++) {
diff = fabs(tnew[i][j]-t[i][j]);
if (diff > priv_difmax) {
priv_difmax = diff;
}
//** copy new to old temperatures**
t[i][j] = tnew[i][j];
}
}
// each thread now updates difmax if needed, one at a time
#pragma omp critical
if (priv_difmax > difmax){
difmax = priv_difmax;
}
// put a barrier here to make sure that diffmax have been updated
// before any thread tests the condition for next iteration of the
// while-loop condition
#pragma omp barrier
}
比较代码以串行和并行方式运行的最好方法是在不支持OpenMP的情况下对其进行编译(例如,使用gcc,使用-fopenmp编译器和链接器标志以及不使用-fopenmp编译器)。这将有助于指出问题出在实际上是并行化还是原始串行代码与“并行就绪”版本之间的其他修改。
这个想法是要知道从原始串行代码到并行代码(没有并行支持编译)到并行代码(与OpenMP)
需要使用一些预处理头,因为如果没有OpenMP支持,编译器将无法识别类似omp_get_thread_num()
之类的函数。 omp_get_wtime()
也不应使用;由于您所有的时间花销都是在并行区域之外完成的,因此无需使用该特定功能,并且对time()
的调用将是准确的(这要求#include <time.h>
)。
// This part is necessary for the code to run whether it is compiled or not with OpenMP
#ifdef _OPENMP
#include <omp.h>
#else
# ifndef _ESCAPE_OMPENMP
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#define omp_get_max_threads() 0
#define _ESCAPE_OMPENMP
#endif
#endif