如何更改并行化的设计,以使该算法可以最有效(而不是完全更改算法)。是否指定#pragma omp simd safelen(1)
会有所帮助。
循环如下(循环5之外的所有循环都不会自动矢量化,并且是该函数的唯一瓶颈)
# pragma omp parallel num_threads(NTt) default(none) private(i,j,k, mythread, dummy) shared(STA,res_sparse_s,COEFF,p_sparse_s, ap_sparse_s,h_sparse_s,RLL, pipi_sparse, normres_sparse, riri_sparse,riri_sparse2,noemer_sparse, nx, ny, nz, nv, PeriodicBoundaryX, PeriodicBoundaryY, PeriodicBoundaryZ)
{
mythread = omp_get_thread_num();//0
// loop 1
#pragma omp for reduction(+:pipi_sparse)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
dummy = COEFF[i][j][k][6] * p_sparse_s[i][j][k];
if (PeriodicBoundaryX && i == 1) dummy += COEFF[i][j][k][0] * p_sparse_s[nx ][j][k];
else dummy += COEFF[i][j][k][0] * p_sparse_s[i-1][j][k];
if (PeriodicBoundaryX && i == nx) dummy += COEFF[i][j][k][1] * p_sparse_s[1 ][j][k];
else dummy += COEFF[i][j][k][1] * p_sparse_s[i+1][j][k];
if (PeriodicBoundaryY && j == 1) dummy += COEFF[i][j][k][2] * p_sparse_s[i][ny ][k];
else dummy += COEFF[i][j][k][2] * p_sparse_s[i][j-1][k];
if (PeriodicBoundaryY && j == ny) dummy += COEFF[i][j][k][3] * p_sparse_s[i][ 1][k];
else dummy += COEFF[i][j][k][3] * p_sparse_s[i][j+1][k];
if (PeriodicBoundaryZ && k == 1) dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][nz ];
else dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][k-1];
if (PeriodicBoundaryZ && k == nz) dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][ 1];
else dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][k+1];
ap_sparse_s[i][j][k] = dummy;
pipi_sparse += p_sparse_s[i][j][k] * ap_sparse_s[i][j][k];
}
// loop 2
#pragma omp for reduction(+:normres_sparse)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
STA[i][j][k] += (riri_sparse / pipi_sparse) * p_sparse_s[i][j][k];
res_sparse_s[i][j][k] -= (riri_sparse / pipi_sparse) * ap_sparse_s[i][j][k];
normres_sparse += (res_sparse_s[i][j][k] * res_sparse_s[i][j][k])/ nv;// need to take square root separately
}
// loop 3
// FORWARD
#pragma omp for schedule(static, nx/NTt)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
dummy = res_sparse_s[i][j][k];
dummy -= COEFF[i][j][k][7] * RLL[i-1][j][k];
if (PeriodicBoundaryX && i==nx)dummy -= COEFF[i][j][k][8] * RLL[1 ][j][k];
dummy -= COEFF[i][j][k][2] * RLL[i][j-1][k];
if (PeriodicBoundaryY && j==ny) dummy -= COEFF[i][j][k][3] * RLL[i][1 ][k];
dummy -= COEFF[i][j][k][4] * RLL[i][j][k-1];
if (PeriodicBoundaryZ && k==nz) dummy -= COEFF[i][j][k][5] * RLL[i][j][1 ];
RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
}
// loop 4
// BACKWARD
#pragma omp for schedule(static, nx/NTt)
for (i=nx; i>=1;i--) for (j=ny; j>=1;j--) for (k=nz; k>=1;k--)
{
dummy = RLL[i][j][k]*h_sparse_s[i][j][k];
if (PeriodicBoundaryX && i==1) dummy -= COEFF[i][j][k][7] * RLL[nx ][j][k];
dummy -= COEFF[i][j][k][8] * RLL[i+1][j][k];
if (PeriodicBoundaryY && j==1) dummy -= COEFF[i][j][k][2] * RLL[i][ny ][k];
dummy -= COEFF[i][j][k][3] * RLL[i][j+1][k];
if (PeriodicBoundaryZ && k==1) dummy -= COEFF[i][j][k][4] * RLL[i][j][nz ];
dummy -= COEFF[i][j][k][5] * RLL[i][j][k+1];
RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
}
// loop 5
#pragma omp for reduction(+:riri_sparse2)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
riri_sparse2 += res_sparse_s[i][j][k] * RLL[i][j][k];
}
// loop 6
#pragma omp for
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
p_sparse_s[i][j][k] = RLL[i][j][k] + (riri_sparse2 / noemer_sparse) * p_sparse_s[i][j][k];
}
}
noemer_sparse = riri_sparse2;
riri_sparse = riri_sparse2;
return;
}