I'm new to MPI and am trying to solve a PDE approximately on a 1000 x 1000 array. At each iteration, every element except those in the first and last rows is updated to the average of its 8 neighbours.
My code runs, but the results differ slightly in the third decimal place depending on how many processors I use. I'm guessing my communication is losing precision? I split the big array by rows, since C/C++ stores arrays row by row.
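For concreteness, the interior update being described is an 8-neighbour (Moore neighbourhood) average with the column index wrapping periodically, which is what the modulo arithmetic in the code below does. A minimal serial sketch of one sweep, with serial_sweep as a hypothetical reference helper rather than part of the posted program, would look like this:
/* One serial sweep of the 8-neighbour average described above.
 * Rows 0 and n-1 are boundary rows and are left untouched; the column
 * index wraps periodically, matching the modulo arithmetic in the MPI
 * code below. serial_sweep is a hypothetical helper, useful for
 * checking the parallel result on a small n. */
void serial_sweep(int n, double a[n][n], double b[n][n])
{
    for (int i = 1; i < n - 1; i++) {
        for (int j = 0; j < n; j++) {
            int jm = (j + n - 1) % n;   /* column to the left, wrapped  */
            int jp = (j + 1) % n;       /* column to the right, wrapped */
            b[i][j] = (a[i-1][jm] + a[i-1][j] + a[i-1][jp]
                     + a[i  ][jm]             + a[i  ][jp]
                     + a[i+1][jm] + a[i+1][j] + a[i+1][jp]) / 8.0;
        }
    }
}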
Here is the code.
#include <iostream>
#include <mpi.h>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
int main(int argc, const char * argv[])
{
// Initialize the MPI environment
MPI_Init(NULL,NULL);
int p;
MPI_Comm_size(MPI_COMM_WORLD, &p);
int id;
MPI_Comm_rank(MPI_COMM_WORLD, &id);
const double pi = 3.1415926;
int n;
n=atoi(argv[argc-1]);
//calculate the starting and ending row indices
int m=floor((n-2)/p);
int r=n-2-m*p;
//dividing row 1 to row n-2 by rows since in C/C++ arrays are stored row-wise
//therefore n-2=(m+1)*r+m*(p-r)
//the first r processors get m+1 rows, the rest get m rows
//starting row, ending row in the original A, width of matrix
int start_row, end_row, width;
if (id<=r-1){
start_row=1+id*(m+1);
end_row=start_row+m;
width=m+1;
}
else {
start_row=1+r*(m+1)+(id-r)*m;
end_row=start_row+m-1;
width=m;
}
//printf("mpi debug 1");
printf("on processor %d, starting row is %d, ending row is %d \n",id,start_row,end_row);
//id of the processor before and after
//id_before is not significant for id==0
//id_after is not significant for id==p-1
int id_before, id_after;
if (id==0)
id_before=p-1;
else
id_before=id-1;
if (id==p-1)
id_after=0;
else
id_after=id+1;
//printf("debug000");
//initialize the local matrix
//**** better way to initialize?
double a[width][n], b[width][n];
for (int i=0; i<width; i++)
for (int j=0; j<n; j++)
a[i][j]=0.5;
//two 1d arrays to store the halo cells
double halo_before[n], halo_after[n];
if (id==0){
for (int j=0; j<n; j++){
halo_before[j]=0.0;
}
}
if (id==p-1){
for (int j=0; j<n; j++) {
halo_after[j]=5*sin(M_PI*((double)j/n)*((double)j/n));
}
}
MPI_Barrier(MPI_COMM_WORLD);
//std::cout << " the sin function is" << 5*sin(M_PI*((double)1/1)*((double)1/2)) << "\n" <<std::endl;
//set id=0 to be the root processor and call
double start_time, end_time;
if (id==0)
start_time=MPI_Wtime();
MPI_Status status_before, status_after;
MPI_Request request_before, request_after;
/////////////////////////////////////////////////////////
//to debug, print out arrays
//std::cout<< " the array on processor " << id <<" to start is \n" << std::endl;
//for (int i=0; i<width; i++){
// for (int j=0; j<n; j++){
// std::cout << a[i][j] << " ";
// if (j==n-1)
// std::cout << "\n" <<std::endl;
// }
//}
//to debug print out halos
//std::cout << "halo_before on processor " << id << " to start with is\n" << std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_before[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//std::cout << "halo_after on processor " << id << " to start with is\n" << std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_after[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//////////////////////////////////////////////////////
//begin iteration
for (int t=0; t<500; t++){
//non-blocking send
//send first row to id_before:
//how should I use tag?
if (id>0){
MPI_Isend(&a[0][0], n, MPI_DOUBLE, id_before, t , MPI_COMM_WORLD, &request_before);
}
if (id<p-1){
//send the last row to id_after
MPI_Isend(&a[width-1][0], n, MPI_DOUBLE, id_after, t, MPI_COMM_WORLD, &request_after);
}
//printf("dubug0");
//update the entries that do not depend on halos
//local row=1 to row=width-2
//add if (width>3)??
int j_b, j_a;
for (int i=1; i<width-1; i++){
for (int j=0; j<n; j++){
j_b=(n+j-1)%n;
j_a=(j+1)%n;
b[i][j]=(a[i-1][j_b]+a[i-1][j]+a[i-1][j_a]+a[i][j_b]+a[i][j_a]+a[i+1][j_b]+a[i+1][j]+a[i+1][j_a])/8;
}
}
//printf("dubug1");
//blocking receive
//may consider a non-blocking receive
//receive from id_before and store in halo_before
//not sure about status
if (id>0){
MPI_Recv(&halo_before[0], n, MPI_DOUBLE, id_before ,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
//receive from id_after and store in halo_after
if (id<p-1){
MPI_Recv(&halo_after[0], n, MPI_DOUBLE, id_after,t , MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
//to debug print out halos
//std::cout << "halo_before on processor " << id << " at iteration " << t<< " is\n" <<std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_before[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//std::cout << "halo_after on processor " << id << " at iteration " << t<< " is\n" <<std::endl;
//for (int i=0;i<n;i++){
// std::cout << halo_after[i] << " ";
// if (i==n-1)
// std::cout <<"\n" <<std::endl;
//}
//update entries that depend on halos
//bugs here, what if width==1???
if (width==1){
for (int j=0; j<n; j++){
j_a=(n+j-1)%n;
j_b=(j+1)%n;
b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+halo_after[j_b]+halo_after[j]+halo_after[j_a])/8;
}
}
else{
for (int j=0; j<n; j++){
j_a=(n+j-1)%n;
j_b=(j+1)%n;
b[0][j]=(halo_before[j_b]+halo_before[j]+halo_before[j_a]+a[0][j_b]+a[0][j_a]+a[1][j_b]+a[1][j]+a[1][j_a])/8;
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
}
}
//copy b back into a
//but make sure the sends have completed first
if (id>0)
MPI_Wait(&request_before,MPI_STATUS_IGNORE);
if (id<p-1)
MPI_Wait(&request_after,MPI_STATUS_IGNORE);
for (int i=0; i<width; i++)
for (int j=0; j<n; j++)
a[i][j]=b[i][j];
//to debug, print out arrays
//std::cout<< " the array on processor " << id <<" at iteration " << t <<" is \n"<< std::endl;
//for (int i=0; i<width; i++){
// for (int j=0; j<n; j++){
// std::cout << a[i][j] << " ";
// if (j==n-1)
// std::cout << "\n" <<std::endl;
// }
//}
}
//calculate the sum
double sum=0.0;
for (int i=0; i<width; i++)
sum += a[i][i+start_row];
double total_sum;
//send to root processor
MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
if (id==0){
end_time=MPI_Wtime();
//double sum_receive[p];
//double sum_calc;
//for (int i=0; i<p; i++){
// MPI_Recv(&sum_receive[i], 1, MPI_DOUBLE, i, i, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// sum_calc += sum_receive[i];
//}
printf("time elapse is %f \n", end_time-start_time);
printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(M_PI*((double)(n-1)/n)*((double)(n-1)/n)));
}
MPI_Finalize();
return 0;
}
Answer (score: 3)
You have a simple typo in one line of your code. This:
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_b]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
should be this (note the third term: a[width-2][j_a], not a second a[width-2][j_b]):
b[width-1][j]=(a[width-2][j_b]+a[width-2][j]+a[width-2][j_a]+a[width-1][j_b]+a[width-1][j_a]+halo_after[j_a]+halo_after[j]+halo_after[j_b])/8;
Because this happens in the last row of each domain, the exact amount of error it causes depends on where the domain boundaries fall, for example, on how many processors you use.
Now, the reason an error like this is more or less inevitable in the code you posted is that the same computation, updating b from a, appears no fewer than three times with slight modifications. That is a ticking time bomb: eventually one of the copies gets changed and becomes inconsistent with the others, or an edit to one of them introduces some error.
There are a number of ways to reduce the amount of duplication here, which both clarifies the code and helps avoid such errors. The best is to fold the halos into the a and b arrays themselves, adding an extra row before and after the local data to hold them; that way you no longer have to worry about whether width == 1 or treat the end rows separately. In addition, define a function that updates a row (or an element) of b from a, and call that function instead of repeating the code.
Below is an example of cleaned-up code along those lines: it encapsulates the individual pieces into routines and handles n, width, etc. consistently with the halo regions included.
#include <mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
int min2i(int a, int b) {
int result = a;
if (b < a) result = b;
return result;
}
void decomposition(const int n, const int nprocs, const int id,
int *start_row, int *width, int *id_before, int *id_after) {
const int nrows = n;
const int m = nrows/nprocs;
const int r = nrows % nprocs;
*width = m;
if (id < r) (*width)++;
*start_row = 1 + id*m + min2i(id,r);
*id_before = (id > 0 ? id-1 : MPI_PROC_NULL);
*id_after = (id < nprocs-1 ? id+1 : MPI_PROC_NULL);
}
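/* Halo exchange helpers: startBC posts non-blocking sends of the first and
   last owned rows to the neighbouring ranks; finishBC receives them into the
   halo rows a[0][*] and a[width+1][*], copies the periodic ghost columns
   0 and n+1, and then waits on the outstanding sends. Since the end ranks
   were given MPI_PROC_NULL neighbours in decomposition(), those transfers
   degenerate to no-ops there, so rank 0 keeps its zero boundary row and
   rank p-1 keeps its sin() boundary row with no special cases. */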
void startBC(const int n, const int width, double a[][n+2], double b[][n+2],
const int id_before, const int id_after, const int t, MPI_Request *req) {
MPI_Isend(&a[1][1], n, MPI_DOUBLE, id_before, 2*t , MPI_COMM_WORLD, &req[0]);
MPI_Isend(&a[width][1], n, MPI_DOUBLE, id_after , 2*t+1, MPI_COMM_WORLD, &req[1]);
}
void finishBC(const int n, const int width, double a[][n+2], double b[][n+2],
const int id_before, const int id_after, const int t, MPI_Request *req) {
MPI_Status stats[2];
MPI_Recv(&a[0][1], n, MPI_DOUBLE, id_before, 2*t+1, MPI_COMM_WORLD, &stats[0]);
MPI_Recv(&a[width+1][1], n, MPI_DOUBLE, id_after, 2*t , MPI_COMM_WORLD, &stats[1]);
for (int i=0; i<width+2; i++) {
a[i][0] = a[i][n];
a[i][n+1] = a[i][1];
}
MPI_Waitall(2, req, stats);
}
void updateRow(const int n, const int width, double a[][n+2], double b[][n+2], const int row) {
for (int j=1; j<=n; j++)
b[row][j]=( a[row-1][j-1] + a[row-1][j] + a[row-1][j+1]
+a[ row ][j-1] + a[ row ][j+1]
+a[row+1][j-1] + a[row+1][j] + a[row+1][j+1])/8;
}
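/* updateRow computes one row of the 8-neighbour average; it assumes the halo
   rows (0 and width+1) and the periodic ghost columns (0 and n+1) of a have
   already been filled, so interior and boundary rows can share the same code. */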
int main(int argc, const char * argv[])
{
int p, id;
MPI_Init(NULL,NULL);
MPI_Comm_size(MPI_COMM_WORLD, &p);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
const double pi = 3.1415926;
int n=atoi(argv[argc-1]);
int width, start_row, id_before, id_after;
decomposition(n, p, id, &start_row, &width, &id_before, &id_after);
double a[width+2][n+2], b[width+2][n+2];
for (int i=0; i<width+2; i++)
for (int j=0; j<n+2; j++)
a[i][j]=0.5;
if (id==p-1)
for (int j=0; j<n+2; j++)
a[width+1][j]=5*sin(pi*((double)(j-1)/n)*((double)(j-1)/n));
if (id==0)
for (int j=0; j<n+2; j++)
a[0][j]=0.;
double start_time, end_time;
if (id==0)
start_time=MPI_Wtime();
MPI_Request reqs[2];
//begin iteration
for (int t=0; t<2; t++){
startBC(n, width, a, b, id_before, id_after, t, reqs);
/* interior rows */
for (int row=2; row<width; row++)
updateRow(n, width, a, b, row);
finishBC(n, width, a, b, id_before, id_after, t, reqs);
/* boundary rows */
updateRow(n, width, a, b, 1);
updateRow(n, width, a, b, width);
for (int i=1; i<width+1; i++)
for (int j=1; j<n+1; j++)
a[i][j]=b[i][j];
}
//calculate the sum
double sum=0.0;
for (int i=1; i<width+1; i++)
sum += a[i][i+start_row-1];
double total_sum;
MPI_Reduce(&sum, &total_sum,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
if (id==0){
end_time=MPI_Wtime();
printf("time elapse is %f \n", end_time-start_time);
printf("at root processor %d, the calculated sum is %f, \n", id, total_sum+5*sin(pi*((double)(n-1)/n)*((double)(n-1)/n)));
}
MPI_Finalize();
return 0;
}
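For what it's worth, this cleaned-up version is plain C (it relies on C99 variable-length array parameters), so it would typically be built with something like mpicc -std=c99 (plus -lm for sin if your toolchain needs it) and launched with mpirun, passing n as the last command-line argument; the exact wrapper and launcher names depend on your MPI installation.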