MPI: help with how to parallelize my code

Asked: 2015-07-27 09:27:18

Tags: parallel-processing mpi message

I am new to this topic and need help with how to parallelize my code. I have a large 1D array that actually describes a 3D volume: 21x21x21 single-precision values. I have 3 computers that I want to involve in the computation. The operation performed on each cell of the grid (volume) is the same for all cells: the program takes some input data, performs some simple arithmetic on it, and assigns the return value to the grid cell.

My non-parallel code is:

float zg, yg, xg;
stack_result = new float[Nz*Ny*Nx];
// StRMtrx[8] is the vertical step size, StRMtrx[6] is the vertical starting point
for (int iz=0; iz<Nz; iz++) {
  zg = iz*StRMtrx[8]+StRMtrx[6];  // find the vertical position in meters
  // StRMtrx[5] is the crossline step size, StRMtrx[3] is the crossline starting point
  for (int iy=0; iy<Ny; iy++) {
    yg = iy*StRMtrx[5]+StRMtrx[3];  // find the crossline position
    // StRMtrx[2] is the inline step size, StRMtrx[0] is the inline starting point
    for (int ix=0; ix < Nx; ix++) { 
      xg = ix*StRMtrx[2]+StRMtrx[0]; // find the inline position
      // do stacking on each grid cell
      // "Geoph" is the geophone ids, "Ngeo" is the number of geophones involved,
      // "phase_use" is the wave type, "EnvMtrx" is the input data common to all
      // cells, "Mdata" is the length of input data
      stack_result[ix+Nx*iy+Nx*Ny*iz] =
        stack_for_qds(Geoph, Ngeo, phase_use, xg, yg, zg, EnvMtrx, Mdata);  
    }        
  }
}

Now I take the 3 computers and split the volume into 3 vertical segments, so I have 3 sub-volumes of 21x21x7 cells each. (Note that the volume is traversed in z, y, x order.) The variable "stack_result" holds the complete volume. My parallel version (which fails completely; I only get one of the sub-volumes back) is:

MPI_Status status;
int rank, numProcs, rootProcess;
ierr = MPI_Init(&argc, &argv);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &rank);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &numProcs);
int rowsInZ = Nz/numProcs;  // 7 cells in Z (vertical)
int chunkSize = Nx*Ny*rowsInZ;
float *stack_result = new float[Nz*Ny*Nx];
float zg, yg, xg;
int offset;  // element offset of a sub-volume inside stack_result
rootProcess = 0;
if(rank == rootProcess) {
  offset = 0;
  for (int n = 1; n < numProcs; n++) { 
    // send rank
    MPI_Send(&n, 1, MPI_INT, n, 2, MPI_COMM_WORLD);
    // send the offset in array
    MPI_Send(&offset, 1, MPI_INT, n, 2, MPI_COMM_WORLD);
    // send volume, now only filled with zeros,
    MPI_Send(&stack_result[offset], chunkSize, MPI_FLOAT, n, 1, MPI_COMM_WORLD);
    offset = offset+chunkSize;
  }
  // receive results
  for (int n = 1; n < numProcs; n++) { 
    int source = n;
    MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
    MPI_Recv(&stack_result[offset], chunkSize, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);
  }
}  else {
  int rank;
  int source = 0;
  int ierr = MPI_Recv(&rank, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
  ierr = MPI_Recv(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD, &status);
  ierr = MPI_Recv(&stack_result[offset], chunkSize, MPI_FLOAT, source, 1, MPI_COMM_WORLD, &status);       
  int nz = rowsInZ;  // sub-volume vertical length
  int startZ = (rank-1)*rowsInZ;
  for (int iz = startZ; iz < startZ+nz; iz++) {
    zg = iz*StRMtrx[8]+StRMtrx[6];
    for (int iy = 0; iy < Ny; iy++) {
      yg = iy*StRMtrx[5]+StRMtrx[3];
      for (int ix = 0; ix < Nx; ix++) {
        xg = ix*StRMtrx[2]+StRMtrx[0];
        stack_result[offset+ix+Nx*iy+Nx*Ny*iz]=
          stack_for_qds(Geoph, Ngeo, phase_use, xg, yg, zg, EnvMtrx, Mdata);
      }  // x-loop
    }  // y-loop
  }   // z-loop
  MPI_Send(&offset, 1, MPI_INT, source, 2, MPI_COMM_WORLD);
  MPI_Send(&stack_result[offset], chunkSize, MPI_FLOAT, source, 1, MPI_COMM_WORLD);
}  // else
write("stackresult.dat", stack_result);
delete [] stack_result;
MPI_Finalize();

Thanks in advance for your patience.

1 Answer:

Answer 0 (score: 1)

You call write("stackresult.dat", stack_result); in all MPI ranks. As a result, they all write to, and therefore overwrite, the same file, and what you see is whatever was written by the last MPI process to execute that statement. You should move the write call into the body of the if (rank == rootProcess) conditional so that only the root process writes the file.
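For illustration, here is a minimal sketch of that change, reusing the names from your code (write() is assumed to be your own output routine):

  // receive results (your existing receive loop in the root branch)
  for (int n = 1; n < numProcs; n++) {
    MPI_Recv(&offset, 1, MPI_INT, n, 2, MPI_COMM_WORLD, &status);
    MPI_Recv(&stack_result[offset], chunkSize, MPI_FLOAT, n, 1, MPI_COMM_WORLD, &status);
  }
  // only the root process holds the assembled volume, so only it writes the file
  write("stackresult.dat", stack_result);
}  // end of if (rank == rootProcess)
delete [] stack_result;   // every rank still frees its own buffer
MPI_Finalize();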

As a side note, sending the value of the rank is redundant: MPI already assigns each process a rank from 0 to #processes - 1. That also makes sending the offset redundant, since each MPI process can easily compute its offset itself from its rank.
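To sketch that idea (this is not your original master/worker layout, just one possible rearrangement): every rank, including the root, derives its own offset from its rank, computes one chunk into a local buffer, and the chunks are then collected on the root with a single MPI_Gather collective. It assumes Nz is divisible by the number of ranks and reuses chunkSize, rowsInZ and stack_for_qds from your code:

// each rank works on rows [rank*rowsInZ, (rank+1)*rowsInZ) of the volume
int startZ = rank * rowsInZ;
float *my_chunk = new float[chunkSize];   // this rank's sub-volume only

for (int iz = startZ; iz < startZ + rowsInZ; iz++) {
  zg = iz*StRMtrx[8] + StRMtrx[6];
  for (int iy = 0; iy < Ny; iy++) {
    yg = iy*StRMtrx[5] + StRMtrx[3];
    for (int ix = 0; ix < Nx; ix++) {
      xg = ix*StRMtrx[2] + StRMtrx[0];
      // index relative to the start of this rank's chunk
      my_chunk[ix + Nx*iy + Nx*Ny*(iz - startZ)] =
        stack_for_qds(Geoph, Ngeo, phase_use, xg, yg, zg, EnvMtrx, Mdata);
    }
  }
}

// concatenate the chunks, in rank order, into stack_result on the root
MPI_Gather(my_chunk, chunkSize, MPI_FLOAT,
           stack_result, chunkSize, MPI_FLOAT,
           rootProcess, MPI_COMM_WORLD);

if (rank == rootProcess)
  write("stackresult.dat", stack_result);

delete [] my_chunk;
delete [] stack_result;
MPI_Finalize();

With a collective like this there is no explicit MPI_Send/MPI_Recv bookkeeping at all: the rank-to-offset mapping is implicit in the gather order, and the root buffer stack_result is only needed (and only filled) on the root process.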