我正在将3-D Jacobi求解器从纯MPI转换为混合MPI + OpenMP。我有一个192x192x192
数组,它在1-D
分解中的纯MPI中的24个进程之间进行划分,即每个进程都有192/24 x 192 x 192 = 8 x 192 x 192
个数据块。现在我做:
for(i=0 ; i <= 7; i++)
for(j=0; j<= 191; j++)
for(k=0; k<= 191; k++)
{
unew[i][j][k] = 1/6.0 * (u[i+1][j][k]+u[i-1][j][k]+
u[i][j+1][k]+u[i][j-1][k]+
u[i][j][k+1]+u[i][j][k-1]);
}
此更新每个进程大约需要60秒。
现在使用Hybrid MPI,我运行两个进程(每个进程1个进程--bind-to socket --map-by socket
和OMP_PROC_PLACES=cores
进程OMP_PROC_BIND=close
)。我为每个MPI进程创建了12个线程(即每个套接字或处理器12个线程)。现在每个MPI进程都有一个大小为192/2 x 192 x 192 = 96x192x192
元素的数组。每个线程都在每个进程拥有的数组的96/12 x 192 x 192 = 8 x 192 x 192
部分上工作。我使用线程进行相同的三次循环更新,但每个线程的时间大约为76秒。负载平衡在两个问题中都是完美的。 可能导致性能下降的原因是什么?是错误共享,因为线程可能使高速缓存行无效接近彼此的数据块?如果是,那么如何降低性能下降?(我故意没有提到重影数据,但最初我并没有重复与计算的通信。)
在回复以下评论时,我发布了代码。为长MWE道歉,但你可以非常安全地忽略(1)头文件声明(2)变量声明(3)内存分配例程(4)笛卡尔拓扑的形成(5)并行设置边界条件使用OpenMP并行区域(6)声明MPI_Type_subarray
数据类型(7)MPI_Isend()
和MPI_Irecv()
调用和只关注(a)INDEPENDENT UPDATE
OpenMP从这里调用并行区域(b)independent_update(...)
例程。
/* IGNORE THIS PORTION */
#include<mpi.h>
#include<omp.h>
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#define MIN(a,b) (a < b ? a : b)
#define Tol 0.00001
/* IGNORE THIS ROUTINE */
void input(int *X, int *Y, int *Z)
{
int a=193, b=193, c=193;
*X = a;
*Y = b;
*Z = c;
}
/* IGNORE THIS ROUTINE */
float*** allocate_mem(int X, int Y, int Z)
{
int i,j;
float ***matrix;
float *arr;
arr = (float*)calloc(X*Y*Z, sizeof(float));
matrix = (float***)calloc(X, sizeof(float**));
for(i = 0 ; i<= X-1; i++)
matrix[i] = (float**)calloc(Y, sizeof(float*));
for(i = 0 ; i <= X-1; i++)
for(j=0; j<= Y-1; j++)
matrix[i][j] = &(arr[i*Y*Z + j*Z]);
return matrix ;
}
/* THIS ROUTINE IS IMPORTANT */
float independent_update(float ***old, float ***new, int NX, int NY, int NZ, int tID, int chunk)
{
int i,j,k, start, end;
float error = 0.0;
float diff;
start = tID * chunk + 1;
end = MIN( (tID+1)*chunk, NX-2 );
for(i = start; i <= end ; i++)
{
for(j = 1; j<= NY-2; j++)
{
#pragma omp simd
for(k = 1; k<= NZ-2; k++)
{
new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
diff = 1.0 - new[i][j][k];
diff = (diff > 0 ? diff : -1.0 * diff );
if(diff > error)
error = diff;
}
}
}
return error;
}
int main(int argc, char *argv[])
{
/* IGNORE VARIABLE DECLARATION */
int size, rank; //Size of old_comm and rank of process
int i, j, k,l; //General loop variables
MPI_Comm old_comm, new_comm; //MPI_COMM_WORLD handle and for MPI_Cart_create()
int N[3]; //For taking input of size of matrix from user
int P; //Represent number of processes i.e. same as size
int dims[3]; //For dimensions of Cartesian topology
int PX, PY, PZ; //X dim, Y dim, Z dim of each process
float ***old, ***new, ***temp; //Matrices for results dimensions is (Px+2)*(PY+2)*(PZ+2)
int period[3]; //Periodicity for each dimension
int reorder; //Whether processes should be reordered in new cartesian topology
int ndims; //Number of dimensions (which is 3)
int Z_TOWARDS_U, Z_AWAY_U; //Z neighbour towards you and away from you (Z const)
int X_DOWN, X_UP; //Below plane and above plane (X const)
int Y_LEFT, Y_RIGHT; //Left plane and right plane (Y const)
int coords[3]; //Finding coordinates of processes
int dimension; //Used in MPI_Cart_shift() , values = 0, 1,2
int displacement; //Used in MPI_Cart_shift(), values will be +1 to find immediate neighbours
float l_max_err; //Local maximum error on process
float l_max_err_new; //For dependent faces.
float G_max_err = 1.0; //Maximum error for stopping criterion
int iterations = 0 ; //Counting number of iterations
MPI_Request send[6], recv[6]; //For MPI_Isend and MPI_Irecv
int start[3]; //Start will be defined in MPI_Isend() and MPI_Irecv()
int gsize[3]; //Defining global size of subarray
MPI_Datatype x_subarray; //For sending X_UP and X_DOWN
int local_x[3]; //Defining local plane size for X_UP/X_DOWN
MPI_Datatype y_subarray; //For sending Y_LEFT and Y_RIGHT
int local_y[3]; //Defining local plane for Y_LEFT/Y_RIGHT
MPI_Datatype z_subarray; //For sending Z_TOWARDS_U and Z_AWAY_U
int local_z[3]; //Defining local plan size for XY plane i.e. where Z=0
double strt, end; //For measuring time
double strt1, end1, delta1; //For measuring trivial time 1
double strt2, end2, delta2; //For measuring trivial time 2
double t_i_strt, t_i_end, t_i_sum=0; //Time for independent computational kernel
double t_up_strt, t_up_end, t_up_sum=0; //Time for X_UP
double t_down_strt, t_down_end, t_down_sum=0; //Time for X_DOWN
double t_left_strt, t_left_end, t_left_sum=0; //Time for Y_LEFT
double t_right_strt, t_right_end, t_right_sum=0; //Time for Y_RIGHT
double t_towards_strt, t_towards_end, t_towards_sum=0; //For Z_TOWARDS_U
double t_away_strt, t_away_end, t_away_sum=0; //For Z_AWAY_U
double t_comm_strt, t_comm_end, t_comm_sum=0; //Time comm + independent update (need to subtract to get comm time)
double t_setup_strt,t_setup_end; //Set-up start and end time
double t_allred_strt,t_allred_end,t_allred_total=0.0; //Measuring Allreduce time separately.
int threadID; //ID of a thread
int nthreads; //Total threads in OpenMP region
int chunk; //chunk - used to calculate iterations of a thread
/* IGNORE MPI STARTUP ETC */
MPI_Init(&argc, &argv);
t_setup_strt = MPI_Wtime();
old_comm = MPI_COMM_WORLD;
MPI_Comm_size(old_comm, &size);
MPI_Comm_rank(old_comm, &rank);
P = size;
if(rank == 0)
{
input(&N[0], &N[1], &N[2]);
}
MPI_Bcast(N, 3, MPI_INT, 0, old_comm);
dims[0] = 0;
dims[1] = 0;
dims[2] = 0;
period[0] = period[1] = period[2] = 0; //All dimensions aperiodic
reorder = 0 ; //No reordering of ranks in new_comm
ndims = 3;
MPI_Dims_create(P,ndims,dims);
MPI_Cart_create(old_comm, ndims, dims, period, reorder, &new_comm);
if( (N[0]-1) % dims[0] == 0 && (N[1]-1) % dims[1] == 0 && (N[2]-1) % dims[2] == 0 )
{
PX = (N[0]-1)/dims[0]; //Rows of unknowns each process gets
PY = (N[1]-1)/dims[1]; //Columns of unknowns each process gets
PZ = (N[2]-1)/dims[2]; //Depth of unknowns each process gets
}
old = allocate_mem(PX+2, PY+2, PZ+2); //3D arrays with ghost points
new = allocate_mem(PX+2, PY+2, PZ+2); //3D arrays with ghost points
dimension = 0;
displacement = 1;
MPI_Cart_shift(new_comm, dimension, displacement, &X_UP, &X_DOWN); //Find UP and DOWN neighbours
dimension = 1;
MPI_Cart_shift(new_comm, dimension, displacement, &Y_LEFT, &Y_RIGHT); //Find UP and DOWN neighbours
dimension = 2;
MPI_Cart_shift(new_comm, dimension, displacement, &Z_TOWARDS_U, &Z_AWAY_U); //Find UP and DOWN neighbours
/* IGNORE BOUNDARY SETUPS FOR PDE */
#pragma omp parallel for default(none) shared(old,new,PX,PY,PZ) private(i,j,k) schedule(static)
for(i = 0; i <= PX+1; i++)
{
for(j = 0; j <= PY+1; j++)
{
for(k = 0; k <= PZ+1; k++)
{
old[i][j][k] = 0.0;
new[i][j][k] = 0.0;
}
}
}
#pragma omp parallel default(none) shared(X_DOWN,X_UP,Y_LEFT,Y_RIGHT,Z_TOWARDS_U,Z_AWAY_U,old,new,PX,PY,PZ) private(i,j,k,threadID,nthreads)
{
threadID = omp_get_thread_num();
nthreads = omp_get_num_threads();
if(threadID == 0)
{
if(X_DOWN == MPI_PROC_NULL) //X is constant here, this is YZ upper plane
{
for(j = 1 ; j<= PY ; j++)
for(k = 1 ; k<= PZ ; k++)
{
old[0][j][k] = 1;
new[0][j][k] = 1; //Set boundaries in new also
}
}
}
if(threadID == (nthreads-1))
{
if(X_UP == MPI_PROC_NULL) //YZ lower plane
{
for(j = 1 ; j<= PY ; j++)
for(k = 1; k<= PZ ; k++)
{
old[PX+1][j][k] = 1;
new[PX+1][j][k] = 1;
}
}
}
if(Y_LEFT == MPI_PROC_NULL) //Y is constant, this is left XZ plane, possibly can use collapse(2)
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX ; i++)
for(k = 1; k<= PZ; k++)
{
old[i][0][k] = 1;
new[i][0][k] = 1;
}
}
if(Y_RIGHT == MPI_PROC_NULL) //XZ right plane, again collapse(2) potential
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX; i++)
for(k = 1; k<= PZ ; k++)
{
old[i][PY+1][k] = 1;
new[i][PY+1][k] = 1;
}
}
if(Z_TOWARDS_U == MPI_PROC_NULL) //Z is constant here, towards you XY plane, collapse(2)
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX ; i++)
for(j = 1; j<= PY ; j++)
{
old[i][j][0] = 1;
new[i][j][0] = 1;
}
}
if(Z_AWAY_U == MPI_PROC_NULL) //Away from you XY plane, collapse(2)
{
#pragma omp for schedule(static)
for(i = 1 ; i<= PX; i++)
for(j = 1; j<= PY ; j++)
{
old[i][j][PZ+1] = 1;
new[i][j][PZ+1] = 1;
}
}
}
/* IGNORE SUBARRAY DECLARATION */
gsize[0] = PX+2; //Global sizes of 3-D cubes for each process
gsize[1] = PY+2;
gsize[2] = PZ+2;
start[0] = 0; //Will specify starting location while sending/receiving
start[1] = 0;
start[2] = 0;
local_x[0] = 1;
local_x[1] = PY;
local_x[2] = PZ;
MPI_Type_create_subarray(ndims, gsize, local_x, start, MPI_ORDER_C, MPI_FLOAT, &x_subarray);
MPI_Type_commit(&x_subarray);
local_y[0] = PX;
local_y[1] = 1;
local_y[2] = PZ;
MPI_Type_create_subarray(ndims, gsize, local_y, start, MPI_ORDER_C, MPI_FLOAT, &y_subarray);
MPI_Type_commit(&y_subarray);
local_z[0] = PX;
local_z[1] = PY;
local_z[2] = 1;
MPI_Type_create_subarray(ndims, gsize, local_z, start, MPI_ORDER_C, MPI_FLOAT, &z_subarray);
MPI_Type_commit(&z_subarray);
t_setup_end = MPI_Wtime();
strt = MPI_Wtime();
while(G_max_err > Tol) //iterations < ITERATIONS)
{
iterations++ ;
t_comm_strt = MPI_Wtime();
/* IGNORE MPI COMMUNICATION */
MPI_Irecv(&old[0][1][1], 1, x_subarray, X_DOWN, 10, new_comm, &recv[0]);
MPI_Irecv(&old[PX+1][1][1], 1, x_subarray, X_UP, 20, new_comm, &recv[1]);
MPI_Irecv(&old[1][PY+1][1], 1, y_subarray, Y_RIGHT, 30, new_comm, &recv[2]);
MPI_Irecv(&old[1][0][1], 1, y_subarray, Y_LEFT, 40, new_comm, &recv[3]);
MPI_Irecv(&old[1][1][PZ+1], 1, z_subarray, Z_AWAY_U, 50, new_comm, &recv[4]);
MPI_Irecv(&old[1][1][0], 1, z_subarray, Z_TOWARDS_U, 60, new_comm, &recv[5]);
MPI_Isend(&old[PX][1][1], 1, x_subarray, X_UP, 10, new_comm, &send[0]);
MPI_Isend(&old[1][1][1], 1, x_subarray, X_DOWN, 20, new_comm, &send[1]);
MPI_Isend(&old[1][1][1], 1, y_subarray, Y_LEFT, 30, new_comm, &send[2]);
MPI_Isend(&old[1][PY][1], 1, y_subarray, Y_RIGHT, 40, new_comm, &send[3]);
MPI_Isend(&old[1][1][1], 1, z_subarray, Z_TOWARDS_U, 50, new_comm, &send[4]);
MPI_Isend(&old[1][1][PZ], 1, z_subarray, Z_AWAY_U, 60, new_comm, &send[5]);
MPI_Waitall(6, send, MPI_STATUSES_IGNORE);
MPI_Waitall(6, recv, MPI_STATUSES_IGNORE);
t_comm_end = MPI_Wtime();
t_comm_sum = t_comm_sum + (t_comm_end - t_comm_strt);
/* Use threads in Independent update */
t_i_strt = MPI_Wtime();
l_max_err = 0.0; //Very important, Reduction result is combined with this !
/* THIS IS THE IMPORTANT REGION */
#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,chunk) private(threadID,nthreads) reduction(max:l_max_err)
{
nthreads = omp_get_num_threads();
threadID = omp_get_thread_num();
chunk = (PX-1+1) / nthreads ;
l_max_err = independent_update(old, new, PX+2, PY+2, PZ+2, threadID, chunk);
}
t_i_end = MPI_Wtime();
t_i_sum = t_i_sum + (t_i_end - t_i_strt) ;
/* IGNORE THE REMAINING CODE */
t_allred_strt = MPI_Wtime();
MPI_Allreduce(&l_max_err, &G_max_err, 1, MPI_FLOAT, MPI_MAX, new_comm);
t_allred_end = MPI_Wtime();
t_allred_total = t_allred_total + (t_allred_end - t_allred_strt);
temp = new ;
new = old;
old = temp;
}
MPI_Barrier(new_comm);
end = MPI_Wtime();
if( rank == 0)
{
printf("\nIterations = %d, G_max_err = %f", iterations, G_max_err);
printf("\nThe total SET-UP time for MPI and boundary conditions is %lf", (t_setup_end-t_setup_strt));
printf("\nThe total time for SOLVING is %lf", (end-strt));
printf("\nThe total time for INDEPENDENT COMPUTE %lf", t_i_sum);
printf("\nThe total time for COMMUNICATION OVERHEAD is %lf", t_comm_sum);
printf("\nThe total time for MPI_ALLREDUCE() is %lf", t_allred_total);
}
MPI_Type_free(&x_subarray);
MPI_Type_free(&y_subarray);
MPI_Type_free(&z_subarray);
free(&old[0][0][0]);
free(&new[0][0][0]);
MPI_Finalize();
return 0;
}
P.S。 :我几乎可以肯定,产生/唤醒线程的成本并不是时间上如此巨大差异的原因。 请查找随附的Scalasca快照,了解混合程序的独立计算。
使用循环simd构造
#pragma omp parallel default(none) shared(old,new,PX,PY,PZ,l_max_err) private(i,j,k,diff)
{
#pragma omp for simd schedule(static) reduction(max:l_max_err)
for(i = 1; i <= PX ; i++)
{
for(j = 1; j<= PY; j++)
{
for(k = 1; k<= PZ; k++)
{
new[i][j][k] = (1/6.0) *(old[i-1][j][k] + old[i+1][j][k] + old[i][j-1][k] + old[i][j+1][k] + old[i][j][k-1] + old[i][j][k+1] );
diff = 1.0 - new[i][j][k];
diff = (diff > 0 ? diff : -1.0 * diff );
if(diff > l_max_err)
l_max_err = diff;
}
}
}
}