我有一组嵌套的 for 循环(每个维度一层),用于遍历三维空间。这些嵌套循环是一个基于模板(stencil)的矩阵求解器的一部分,其中的运算存在数据依赖。我查阅了许多链接和在线资料,深入研究了循环变换的细节,看起来循环倾斜(loop skewing)可以帮到我。对于二维网格(由两层嵌套循环组成)来说,这相当简单,但我发现很难将其扩展到三维。循环如下所示。
/*
 * One OpenMP parallel region containing three worksharing loop nests over
 * the index range i = 1..nx, j = 1..ny, k = 1..nz:
 *   loop 1 - applies a 7-point stencil to p_sparse_s, stores the result in
 *            ap_sparse_s and accumulates sum(p * ap) into pipi_sparse;
 *   loop 2 - forward sweep that overwrites RLL in place, using RLL values at
 *            i-1, j-1 and k-1;
 *   loop 3 - backward sweep that overwrites RLL in place, using RLL values at
 *            i+1, j+1 and k+1.
 * i, j, k, mythread and dummy are private per thread; everything else named
 * in the shared() clause is shared. No collapse clause is used, so each
 * "omp for" distributes only the outer i loop.
 * NOTE(review): the declarations of all variables are outside this fragment;
 * array extents (e.g. whether indices 0 and n+1 are valid halo cells) should
 * be confirmed against them.
 */
# pragma omp parallel num_threads(NTt) default(none) private(i,j,k, mythread, dummy) shared(STA,res_sparse_s,COEFF,p_sparse_s, ap_sparse_s,h_sparse_s,RLL, pipi_sparse, normres_sparse, riri_sparse,riri_sparse2,noemer_sparse, nx, ny, nz, nv, PeriodicBoundaryX, PeriodicBoundaryY, PeriodicBoundaryZ)
{
// Thread id of the calling thread; not read again within this fragment.
mythread = omp_get_thread_num();//0
// loop 1
// ap = A * p for a 7-point stencil, plus the dot product p . ap.
// COEFF[..][6] multiplies the centre point; [0]/[1] the i-1/i+1 neighbours,
// [2]/[3] the j-1/j+1 neighbours and [4]/[5] the k-1/k+1 neighbours.
// p_sparse_s is only read here and ap_sparse_s is only written at [i][j][k],
// so iterations do not depend on each other; pipi_sparse is combined across
// threads by the reduction clause.
#pragma omp for reduction(+:pipi_sparse)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
dummy = COEFF[i][j][k][6] * p_sparse_s[i][j][k];
// With a periodic boundary the neighbour index wraps to the opposite face
// (1 <-> nx, 1 <-> ny, 1 <-> nz); otherwise the plain i-1/i+1 (etc.) cell is
// read, which at the faces is index 0 or n+1 - presumably halo cells, confirm.
if (PeriodicBoundaryX && i == 1) dummy += COEFF[i][j][k][0] * p_sparse_s[nx ][j][k];
else dummy += COEFF[i][j][k][0] * p_sparse_s[i-1][j][k];
if (PeriodicBoundaryX && i == nx) dummy += COEFF[i][j][k][1] * p_sparse_s[1 ][j][k];
else dummy += COEFF[i][j][k][1] * p_sparse_s[i+1][j][k];
if (PeriodicBoundaryY && j == 1) dummy += COEFF[i][j][k][2] * p_sparse_s[i][ny ][k];
else dummy += COEFF[i][j][k][2] * p_sparse_s[i][j-1][k];
if (PeriodicBoundaryY && j == ny) dummy += COEFF[i][j][k][3] * p_sparse_s[i][ 1][k];
else dummy += COEFF[i][j][k][3] * p_sparse_s[i][j+1][k];
if (PeriodicBoundaryZ && k == 1) dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][nz ];
else dummy += COEFF[i][j][k][4] * p_sparse_s[i][j][k-1];
if (PeriodicBoundaryZ && k == nz) dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][ 1];
else dummy += COEFF[i][j][k][5] * p_sparse_s[i][j][k+1];
ap_sparse_s[i][j][k] = dummy;
pipi_sparse += p_sparse_s[i][j][k] * ap_sparse_s[i][j][k];
}
// loop 2
// FORWARD
// In-place forward sweep: RLL[i][j][k] is computed from res_sparse_s and from
// RLL at [i-1], [j-1] and [k-1], i.e. from values this same loop nest writes
// in earlier iterations. Unlike loop 1, the i-1/j-1/k-1 terms are always
// subtracted; the periodic terms (wrap to index 1) are extra, applied only on
// the last plane/row/column.
// NOTE(review): this "omp for" hands different i ranges to different threads
// with no ordering between them, while RLL[i-1][j][k] must already hold the
// value produced by iteration i-1. The first i of every chunk therefore reads
// a plane owned by another thread - this is the loop-carried dependence the
// surrounding text asks about; confirm before relying on the results.
// NOTE(review): if nx and NTt are integers and nx < NTt, nx/NTt is 0, which
// is not a valid chunk size for schedule(static, ...) - TODO confirm types.
#pragma omp for schedule(static, nx/NTt)
for (i=1; i<=nx; i++) for (j=1; j<=ny; j++) for (k=1; k<=nz; k++)
{
dummy = res_sparse_s[i][j][k];
dummy -= COEFF[i][j][k][7] * RLL[i-1][j][k];
if (PeriodicBoundaryX && i==nx)dummy -= COEFF[i][j][k][8] * RLL[1 ][j][k];
dummy -= COEFF[i][j][k][2] * RLL[i][j-1][k];
if (PeriodicBoundaryY && j==ny) dummy -= COEFF[i][j][k][3] * RLL[i][1 ][k];
dummy -= COEFF[i][j][k][4] * RLL[i][j][k-1];
if (PeriodicBoundaryZ && k==nz) dummy -= COEFF[i][j][k][5] * RLL[i][j][1 ];
RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
}
// loop 3
// BACKWARD
// In-place backward sweep, iterating every index downwards: RLL[i][j][k] is
// first rescaled by h_sparse_s, then the contributions of RLL at [i+1], [j+1]
// and [k+1] (written earlier in this same sweep) are subtracted, plus the
// periodic wrap-around terms on the first plane/row/column, and the result is
// divided by h_sparse_s again.
// NOTE(review): the same cross-thread concern as in loop 2 applies, mirrored:
// RLL[i+1][j][k] must be final before iteration i runs.
#pragma omp for schedule(static, nx/NTt)
for (i=nx; i>=1;i--) for (j=ny; j>=1;j--) for (k=nz; k>=1;k--)
{
dummy = RLL[i][j][k]*h_sparse_s[i][j][k];
if (PeriodicBoundaryX && i==1) dummy -= COEFF[i][j][k][7] * RLL[nx ][j][k];
dummy -= COEFF[i][j][k][8] * RLL[i+1][j][k];
if (PeriodicBoundaryY && j==1) dummy -= COEFF[i][j][k][2] * RLL[i][ny ][k];
dummy -= COEFF[i][j][k][3] * RLL[i][j+1][k];
if (PeriodicBoundaryZ && k==1) dummy -= COEFF[i][j][k][4] * RLL[i][j][nz ];
dummy -= COEFF[i][j][k][5] * RLL[i][j][k+1];
RLL[i][j][k] = dummy / h_sparse_s[i][j][k];
}
}
循环 1:ap_sparse_s[i][j][k] 对 p_sparse_s 的 [i+1]、[i-1]、[j-1]、[j+1]、[k-1]、[k+1] 邻点存在数据依赖性;p_sparse_s 的值是只读的。
循环 2:RLL[i][j][k] 对 [i-1]、[j-1]、[k-1] 处的 RLL 存在数据依赖性。
循环 3:RLL[i][j][k] 对 [i+1]、[j+1]、[k+1] 处的 RLL 存在数据依赖性。
COEFF[i][j][k][NUM] 只是为三维空间中的每个点定义的通用系数(一些常数)。由于存在 9 个与相邻点相对应的系数,因此有 COEFF[][][][0]、COEFF[][][][1] …… COEFF[][][][8],依此类推。
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<omp.h>
/* Alias for double; not referenced anywhere else in the code shown here. */
typedef double lr;
/* Number of grid points along each axis that the loops below visit
   (indices 1..n). The arrays in this file are declared with n+2 entries
   per axis, which leaves one extra layer at index 0 and at index n+1. */
#define nx 4
#define ny 4
#define nz 4
/*
 * Print the entries a[1..nx][1..ny][1..nz] of a 3-D array to stdout.
 *
 * Each (plane, row) pair is written as one line of "%f "-formatted values;
 * a blank line follows every plane. The outermost layer of the array
 * (index 0 and index n+1 on each axis) is not printed.
 */
void
print3dmatrix(double a[nx+2][ny+2][nz+2])
{
    for (int plane = 1; plane <= nx; ++plane) {
        for (int row = 1; row <= ny; ++row) {
            /* One contiguous line of the array along the last axis. */
            const double *cells = a[plane][row];

            for (int col = 1; col <= nz; ++col) {
                printf("%f ", cells[col]);
            }
            printf("\n");
        }
        printf("\n");
    }
}
/*
 * Test driver comparing two formulations of the same in-place 3-D sweep.
 *
 * Two identical arrays are filled with pseudo-random values in 0..4 using a
 * fixed seed. Array `a` is updated with the plain i/j/k loop nest; array `b`
 * is updated with the innermost loop skewed by the middle index
 * (m = k + j). Both results are printed so they can be compared by eye.
 */
int
main()
{
    double a[nx+2][ny+2][nz+2];
    double b[nx+2][ny+2][nz+2];

    srand(3461833726);

    /* Fill every cell, including the outer layer, and mirror a into b. */
    for (int x = 0; x < nx+2; ++x) {
        for (int y = 0; y < ny+2; ++y) {
            for (int z = 0; z < nz+2; ++z) {
                a[x][y][z] = rand() % 5;
                b[x][y][z] = a[x][y][z];
            }
        }
    }

    /* Reference sweep: each cell uses its already-updated x-1, y-1 and z-1
       neighbours, so the iteration order matters. */
    for (int x = 1; x <= nx; ++x) {
        for (int y = 1; y <= ny; ++y) {
            for (int z = 1; z <= nz; ++z) {
                const double prev_x = a[x-1][y][z];
                const double prev_y = a[x][y-1][z];
                const double prev_z = a[x][y][z-1];
                const double centre = a[x][y][z];

                a[x][y][z] = -1 * prev_x - 1 * prev_y - 1 * prev_z + 4 * centre;
            }
        }
    }
    print3dmatrix(a);
    printf("******************************\n");

    /* Skewed sweep: the innermost counter runs over m = z + y, and the
       original z index is recovered as m - y. The cells are visited in the
       same order as in the reference sweep. */
    for (int x = 1; x <= nx; ++x) {
        for (int y = 1; y <= ny; ++y) {
            for (int m = y + 1; m <= y + nz; ++m) {
                const int z = m - y;
                const double prev_x = b[x-1][y][z];
                const double prev_y = b[x][y-1][z];
                const double prev_z = b[x][y][z-1];
                const double centre = b[x][y][z];

                b[x][y][z] = -1 * prev_x - 1 * prev_y - 1 * prev_z + 4 * centre;
            }
        }
    }
    print3dmatrix(b);
    printf("=========================\n");

    return 0;
}