使用纯MPI,我的运行时间为3.7s
使用混合MPI + OpenMP时,我的运行时间变为4.1s
纯MPI运行在16个节点上,每个节点1个内核。
纯MPI带有1个节点,16个内核。
混合MPI + OpenMP运行在16个节点上,每个节点16个内核。
我已经尝试了printf调试,所有进程都按预期使用了16个内核。
任何见解都会有所帮助!
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myRank);
MPI_Comm_size(MPI_COMM_WORLD, &numNodes);
/*
 * Main iteration loop. Runs while the largest per-cell change across all ranks
 * (dtMain) is still above MAX_TEMP_ERROR and the iteration count has not
 * exceeded max_iterations.
 */
while (dtMain > MAX_TEMP_ERROR && iteration <= max_iterations) {
    /*
     * Neighbour ranks for the boundary-row exchange. MPI_PROC_NULL makes the
     * corresponding send/receive a no-op on the FIRST and LAST rank, so the
     * exchange needs no rank-specific branches and the receive buffer there
     * is left untouched.
     */
    const int rankAbove = (myRank != FIRST) ? myRank - 1 : MPI_PROC_NULL;
    const int rankBelow = (myRank != LAST) ? myRank + 1 : MPI_PROC_NULL;

    /* Each interior cell becomes the mean of its four neighbours in Temperature_last. */
    #pragma omp parallel for private(i, j)
    for (i = 1; i <= ROWS; i++) {
        for (j = 1; j <= COLUMNS; j++) {
            Temperature[i][j] = 0.25 * (Temperature_last[i+1][j] + Temperature_last[i-1][j] +
                                        Temperature_last[i][j+1] + Temperature_last[i][j-1]);
        }
    }

    /*
     * Boundary-row exchange, done by the main thread outside any parallel region.
     * MPI_Sendrecv pairs each send with its matching receive, so completion
     * does not rely on MPI buffering the outgoing row and the ranks do not
     * end up finishing their sends one after another along the chain.
     */

    /* Row ROWS goes to the next rank (tag 0); row 0 is filled from the previous rank. */
    MPI_Sendrecv(&Temperature[ROWS][1], COLUMNS, MPI_DOUBLE, rankBelow, 0,
                 &Temperature_last[0][1], COLUMNS, MPI_DOUBLE, rankAbove, 0,
                 MPI_COMM_WORLD, &status);

    /* Row 1 goes to the previous rank (tag 1); row ROWS + 1 is filled from the next rank. */
    MPI_Sendrecv(&Temperature[1][1], COLUMNS, MPI_DOUBLE, rankAbove, 1,
                 &Temperature_last[ROWS + 1][1], COLUMNS, MPI_DOUBLE, rankBelow, 1,
                 MPI_COMM_WORLD, &status);

    dt = 0.0; // reset largest temperature change

    /*
     * Find this rank's largest per-cell change and copy the new values into
     * Temperature_last for the next iteration. The max reduction gives each
     * thread its own dt and combines them when the loop ends.
     */
    #pragma omp parallel for private(i, j) reduction(max:dt)
    for (i = 1; i <= ROWS; i++) {
        for (j = 1; j <= COLUMNS; j++) {
            dt = fmax(fabs(Temperature[i][j] - Temperature_last[i][j]), dt);
            Temperature_last[i][j] = Temperature[i][j];
        }
    }

    /* Every rank receives the global maximum change, so all ranks leave the loop together. */
    MPI_Allreduce(&dt, &dtMain, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    iteration++;
}
/* Block until every rank in MPI_COMM_WORLD has reached this point. */
MPI_Barrier(MPI_COMM_WORLD);