Question

我有一个2D数组，它分布在一个MPI进程网格上（本例中为3 x 2个进程）。数组各部分的值由持有该部分的进程生成，我想在根进程中收集所有这些块以显示它们。

到目前为止，我有以下代码。它会创建一个笛卡尔通信器，找出MPI进程的坐标，并据此计算出该进程应该得到数组的多大一部分（因为数组大小不必是笛卡尔网格大小的倍数）。然后我创建一个新的MPI派生数据类型，它将该进程的整个子数组作为一个元素发送（也就是说，每个进程的步幅、块长度和计数都不同，因为每个进程的数组大小不同）。但是，当我用MPI_Gather收集数据时，遇到了分段错误。

我认为这是因为我不应该在MPI_Gather调用中使用相同的数据类型进行发送和接收。数据类型适用于发送数据，因为它具有正确的计数，步幅和块长度，但是当它到达另一端时，它将需要一个非常不同的派生数据类型。我不确定如何计算此数据类型的参数 - 有没有人有任何想法？

另外，如果我从完全错误的角度接近这个，请告诉我！

#include<stdio.h>
#include<array_alloc.h>
#include<math.h>
#include<mpi.h>

/*
 * Question's example program: sets up a B x A periodic Cartesian
 * communicator, works out how large this rank's sub-block of an n x n
 * array is, fills a local block, and then attempts to collect every block
 * on rank 0 with a single MPI_Gather using one derived datatype for both
 * the send and the receive side.
 */
int main(int argc, char ** argv)
{
    int size, rank;
    int dim_size[2];
    int periods[2];
    int A = 2;
    int B = 3;
    MPI_Comm cart_comm;
    MPI_Datatype block_type;
    int coords[2];

    float **array;
    float **whole_array;

    int n = 10;
    int rows_per_core;
    int cols_per_core;
    int i, j;

    int x_start, x_finish;
    int y_start, y_finish;

    /* Initialise MPI */
    MPI_Init(&argc, &argv);

    /* Get the rank for this process, and the number of processes */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* whole_array is only allocated on rank 0; it is left unset elsewhere */
    if (rank == 0)
    {
        /* If we're the master process */
        whole_array = alloc_2d_float(n, n);

        /* Initialise whole array to silly values */
        for (i = 0; i < n; i++)
        {
            for (j = 0; j < n; j++)
            {
                whole_array[i][j] = 9999.99;
            }
        }

        /* Print the sentinel-filled array before any communication */
        for (j = 0; j < n; j ++)
        {
            for (i = 0; i < n; i++)
            {
                printf("%f ", whole_array[j][i]);
            }
            printf("\n");
        }
    }

    /* Create the cartesian communicator */
    dim_size[0] = B;
    dim_size[1] = A;
    periods[0] = 1;
    periods[1] = 1;

    /* NOTE(review): reorder is 1 here, so ranks in cart_comm may differ from
     * the MPI_COMM_WORLD rank used below - confirm */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim_size, periods, 1, &cart_comm);

    /* Get our co-ordinates within that communicator */
    MPI_Cart_coords(cart_comm, rank, 2, coords);

    /* Nominal block size: n split over A row-blocks and B column-blocks,
     * rounded up */
    rows_per_core = ceil(n / (float) A);
    cols_per_core = ceil(n / (float) B);

    /* The last block in each direction takes whatever remains of n */
    if (coords[0] == (B - 1))
    {
        /* We're at the far end of a row */
        cols_per_core = n - (cols_per_core * (B - 1));
    }
    if (coords[1] == (A - 1))
    {
        /* We're at the bottom of a col */
        rows_per_core = n - (rows_per_core * (A - 1));
    }

    printf("X: %d, Y: %d, RpC: %d, CpC: %d\n", coords[0], coords[1], rows_per_core, cols_per_core);

    /* rows_per_core blocks of cols_per_core floats, stride cols_per_core + 1.
     * NOTE(review): alloc_2d_float places rows exactly cols_per_core floats
     * apart, so this stride does not match the local array - confirm */
    MPI_Type_vector(rows_per_core, cols_per_core, cols_per_core + 1, MPI_FLOAT, &block_type);
    MPI_Type_commit(&block_type);

    array = alloc_2d_float(rows_per_core, cols_per_core);

    if (array == NULL)
    {
        printf("Problem with array allocation.\nExiting\n");
        /* NOTE(review): returns without calling MPI_Finalize or MPI_Abort */
        return 1;
    }

    /* Fill every row of the local block with 1, 2, 3, ... */
    for (j = 0; j < rows_per_core; j++)
    {
        for (i = 0; i < cols_per_core; i++)
        {
            array[j][i] = (float) (i + 1);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* NOTE(review): this is the call reported to segfault. The buffers passed
     * are the float** row-pointer tables rather than the float data at
     * array[0] / whole_array[0], and the same per-rank block_type is used as
     * both send and receive type even though block sizes differ per rank. */
    MPI_Gather(array, 1, block_type, whole_array, 1, block_type, 0, MPI_COMM_WORLD);

    /*
    if (rank == 0)
    {
        for (j = 0; j < n; j ++)
        {
            for (i = 0; i < n; i++)
            {
                printf("%f ", whole_array[j][i]);
            }
            printf("\n");
        }
    }
    */
    /* Close down the MPI environment */
    MPI_Finalize();
}

我上面使用的2D数组分配程序实现为：

/*
 * Allocate an ndim1 x ndim2 array of floats that can be indexed as
 * array[i][j] while keeping all of the data in one contiguous block.
 *
 * The result is a table of ndim1 row pointers; row 0 points at the start of
 * a single ndim1 * ndim2 float allocation and every following row points
 * ndim2 floats further into that same allocation. The contents are not
 * initialised.
 *
 * Returns NULL if either dimension is not positive, if the requested size
 * does not fit in size_t, or if an allocation fails. To release the array,
 * free array[0] first and then the row table itself.
 */
float **alloc_2d_float( int ndim1, int ndim2 ) {

  /* A zero-length row table would make the array2[0] store below write
     outside the allocation, so empty and negative shapes are refused. */
  if( ndim1 <= 0 || ndim2 <= 0 )
    return NULL;

  /* Size arithmetic is done in size_t so the product cannot overflow int. */
  size_t nrows = (size_t) ndim1;
  size_t ncols = (size_t) ndim2;

  if( nrows > ( (size_t) -1 ) / sizeof( float * ) )
    return NULL;
  if( ncols > ( (size_t) -1 ) / sizeof( float ) / nrows )
    return NULL;

  float **array2 = malloc( nrows * sizeof *array2 );
  if( array2 == NULL )
    return NULL;

  array2[0] = malloc( nrows * ncols * sizeof **array2 );
  if( array2[0] == NULL ) {
    /* Do not leak the row table when the data block cannot be obtained. */
    free( array2 );
    return NULL;
  }

  /* Point each remaining row at its slice of the contiguous data block. */
  for( size_t i = 1; i < nrows; i++ )
    array2[i] = array2[0] + i * ncols;

  return array2;

}

Answer 1

这是一个棘手的问题。你走在正确的轨道上，是的，你需要不同类型的发送和接收。

发送部分很简单 - 如果您发送整个子数组array，那么您甚至不需要矢量类型；只需从&(array[0][0])（或者array[0]，如果您愿意）开始发送(rows_per_core)*(cols_per_core)个连续的浮点数即可。

棘手的部分在于接收，也就是收集（gather）这一步。让我们从最简单的情况开始 - 假设所有内容均匀划分，因此所有块都具有相同的大小。然后你可以使用非常好用的MPI_Type_create_subarray（你总是可以用矢量类型把它拼凑出来，但是对于更高维的数组，这会变得很繁琐，因为除了最后一个维度之外，你需要为数组的每个维度创建1个中间类型）。

此外，您可以使用同样有用的MPI_Dims_create来创建尽可能接近正方形的进程（rank）分解，而不是对分解进行硬编码。注意这不一定与MPI_Cart_create有任何关系，尽管您可以用它来得到所请求的维度。我将在这里跳过cart_create的部分，不是因为它没用，而是因为我想专注于收集（gather）部分。

因此，如果每个人都具有相同的array大小，那么root会从每个人那里接收相同的数据类型，并且可以使用非常简单的子阵列类型来获取他们的数据：

/* Describe one rank's sub-block, located inside the full array, as a
 * 2-dimensional subarray datatype of floats in C (row-major) order */
MPI_Type_create_subarray(2, whole_array_size, sub_array_size, starts,
                         MPI_ORDER_C, MPI_FLOAT, &block_type);
MPI_Type_commit(&block_type);

其中sub_array_size[] = {rows_per_core, cols_per_core}，whole_array_size[] = {n,n}，starts[]={0,0} - 也就是说，我们暂且假设每个块都从整个数组的起点开始。之所以可以这样做，是因为我们可以使用Gatherv显式地设置每个块在数组中的位移：

/* For every rank, record how many receive-type items it contributes and
 * where in the whole array its block starts */
for (int i=0; i<size; i++) {
    counts[i] = 1;   /* one block_type per rank */

    /* position of rank i's block in the grid of blocks */
    int row = (i % A);
    int col = (i / A);
    /* displacement into the whole_array */
    disps[i] = (col*cols_per_core + row*(rows_per_core)*n);
}

/* Each rank sends its block as plain contiguous floats; the root receives
 * one resized_type item per rank at the displacement computed above */
MPI_Gatherv(array[0], rows_per_core*cols_per_core, MPI_FLOAT,
            recvptr, counts, disps, resized_type, 0, MPI_COMM_WORLD);

所以现在每个进程都把自己的数据作为一个连续块发送，而根进程则以该类型把它接收到数组中的正确位置。为了实现这一点，我调整了类型的大小，使其范围（extent）只有一个浮点数，这样位移就可以以浮点数为单位来计算：

/* Derive a type from block_type with lower bound 0 and an extent of one
 * float, so that displacements can be expressed in units of floats */
MPI_Type_create_resized(block_type, 0, 1*sizeof(float), &resized_type);
MPI_Type_commit(&resized_type);

整个代码如下：

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<mpi.h>

/*
 * Allocate an ndim1 x ndim2 array of floats that can be indexed as
 * array[i][j] while keeping all of the data in one contiguous block.
 *
 * The result is a table of ndim1 row pointers; row 0 points at the start of
 * a single ndim1 * ndim2 float allocation and every following row points
 * ndim2 floats further into that same allocation. The contents are not
 * initialised.
 *
 * Returns NULL if either dimension is not positive, if the requested size
 * does not fit in size_t, or if an allocation fails. To release the array,
 * free array[0] first and then the row table itself.
 */
float **alloc_2d_float( int ndim1, int ndim2 ) {
    /* A zero-length row table would make the array2[0] store below write
     * outside the allocation, so empty and negative shapes are refused. */
    if( ndim1 <= 0 || ndim2 <= 0 )
        return NULL;

    /* Size arithmetic is done in size_t so the product cannot overflow int. */
    size_t nrows = (size_t) ndim1;
    size_t ncols = (size_t) ndim2;

    if( nrows > ( (size_t) -1 ) / sizeof( float * ) )
        return NULL;
    if( ncols > ( (size_t) -1 ) / sizeof( float ) / nrows )
        return NULL;

    float **array2 = malloc( nrows * sizeof *array2 );
    if( array2 == NULL )
        return NULL;

    array2[0] = malloc( nrows * ncols * sizeof **array2 );
    if( array2[0] == NULL ) {
        /* Do not leak the row table when the data block cannot be obtained. */
        free( array2 );
        return NULL;
    }

    /* Point each remaining row at its slice of the contiguous data block. */
    for( size_t i = 1; i < nrows; i++ )
        array2[i] = array2[0] + i * ncols;

    return array2;
}

/*
 * Release a 2D float array whose data lives in one block addressed by
 * array[0] and whose row table is `array` itself.
 * Passing NULL does nothing.
 */
void free_2d_float( float **array ) {
    if (array == NULL)
        return;

    /* data block first, then the table of row pointers */
    float *data_block = array[0];
    free(data_block);
    free(array);
}

/*
 * Set every element array[r][c], for 0 <= r < ndim1 and 0 <= c < ndim2,
 * to the value `data`.
 */
void init_array2d(float **array, int ndim1, int ndim2, float data) {
    int r, c;

    for (r = 0; r < ndim1; ++r) {
        for (c = 0; c < ndim2; ++c) {
            *(array[r] + c) = data;
        }
    }
}

/*
 * Write the ndim1 x ndim2 array to standard output, one array row per
 * line, each value formatted as "%6.2f" followed by a single space.
 */
void print_array2d(float **array, int ndim1, int ndim2) {
    int r = 0;

    while (r < ndim1) {
        int c = 0;
        while (c < ndim2) {
            printf("%6.2f ", array[r][c]);
            ++c;
        }
        putchar('\n');
        ++r;
    }
}


/*
 * Gather equally sized blocks of an n x n float array onto rank 0.
 *
 * The process count is factorised into a B x A grid with MPI_Dims_create.
 * Every rank owns a rows_per_core x cols_per_core block filled with its own
 * rank number and sends it as plain contiguous floats. Rank 0 receives each
 * block through a subarray datatype whose extent has been resized to one
 * float, so that MPI_Gatherv displacements can be given in floats and each
 * block lands at its proper place in the whole array.
 *
 * The program aborts if n is not evenly divisible by either grid dimension
 * or if a memory allocation fails.
 */
int main(int argc, char ** argv)
{
    int size, rank;
    /* MPI_Dims_create only fills in entries that are zero on input; any
     * other value is taken as a constraint, so the array must be zeroed. */
    int dim_size[2] = {0, 0};
    MPI_Datatype block_type, resized_type;

    float **array;
    /* Only rank 0 owns the full array; the receive buffer is ignored on the
     * other ranks, but it must still hold a defined value to be passed. */
    float **whole_array = NULL;
    float *recvptr = NULL;

    int *counts, *disps;

    int n = 10;
    int rows_per_core;
    int cols_per_core;

    int whole_array_size[2];
    int sub_array_size[2];
    int starts[2];
    int A, B;

    /* Initialise MPI */
    MPI_Init(&argc, &argv);

    /* Get the rank for this process, and the number of processes */
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0)
    {
        /* If we're the master process */
        whole_array = alloc_2d_float(n, n);
        if (whole_array == NULL)
        {
            printf("Problem with array allocation.\nExiting\n");
            MPI_Abort(MPI_COMM_WORLD,3);
            return 1;
        }
        recvptr = &(whole_array[0][0]);

        /* Initialise whole array to silly values so that any element the
         * gather fails to overwrite is easy to spot */
        init_array2d(whole_array, n, n, 9999.99);

        print_array2d(whole_array, n, n);
        puts("\n\n");
    }

    /* Choose a process-grid decomposition that is as square as possible */
    MPI_Dims_create(size, 2, dim_size);
    A = dim_size[1];
    B = dim_size[0];

    rows_per_core = ceil(n / (float) A);
    cols_per_core = ceil(n / (float) B);
    /* This version handles only the evenly divisible case */
    if (rows_per_core*A != n) {
        if (rank == 0) fprintf(stderr,"Aborting: rows %d don't divide by %d evenly\n", n, A);
        MPI_Abort(MPI_COMM_WORLD,1);
    }
    if (cols_per_core*B != n) {
        if (rank == 0) fprintf(stderr,"Aborting: cols %d don't divide by %d evenly\n", n, B);
        MPI_Abort(MPI_COMM_WORLD,2);
    }

    array = alloc_2d_float(rows_per_core, cols_per_core);
    printf("%d, RpC: %d, CpC: %d\n", rank, rows_per_core, cols_per_core);

    if (array == NULL)
    {
        printf("Problem with array allocation.\nExiting\n");
        MPI_Abort(MPI_COMM_WORLD,3);
        return 1;
    }

    /* Receive type: one block placed at the origin of the whole array ... */
    whole_array_size[0] = n;
    whole_array_size[1] = n;
    sub_array_size[0] = rows_per_core;
    sub_array_size[1] = cols_per_core;
    starts[0] = 0;
    starts[1] = 0;

    MPI_Type_create_subarray(2, whole_array_size, sub_array_size, starts,
                             MPI_ORDER_C, MPI_FLOAT, &block_type);
    MPI_Type_commit(&block_type);
    /* ... with its extent shrunk to one float, so that the displacements
     * passed to MPI_Gatherv below are counted in floats */
    MPI_Type_create_resized(block_type, 0, 1*sizeof(float), &resized_type);
    MPI_Type_commit(&resized_type);

    init_array2d(array,rows_per_core,cols_per_core,(float)rank);

    counts = malloc(size * sizeof *counts);
    disps  = malloc(size * sizeof *disps);
    if (counts == NULL || disps == NULL)
    {
        printf("Problem with array allocation.\nExiting\n");
        MPI_Abort(MPI_COMM_WORLD,4);
        return 1;
    }

    /* note -- we're just using MPI_COMM_WORLD rank here to
     * determine location, not the cart_comm for now... */
    for (int r = 0; r < size; r++) {
        counts[r] = 1;   /* one block_type per rank */

        int row = (r % A);
        int col = (r / A);
        /* displacement into the whole_array, in floats */
        disps[r] = (col*cols_per_core + row*(rows_per_core)*n);
    }

    MPI_Gatherv(array[0], rows_per_core*cols_per_core, MPI_FLOAT,
                recvptr, counts, disps, resized_type, 0, MPI_COMM_WORLD);

    if (rank == 0) print_array2d(whole_array, n, n);

    /* Release everything this function acquired */
    free(counts);
    free(disps);
    free_2d_float(array);
    free_2d_float(whole_array);   /* NULL on non-root ranks: no-op */
    MPI_Type_free(&resized_type);
    MPI_Type_free(&block_type);

    MPI_Finalize();
    return 0;
}

一个小问题 - 在gather之前你不需要屏障（barrier）。事实上，你几乎从不真正需要屏障；出于多种原因它们是开销很大的操作，而且可能掩盖问题 - 我的经验法则是永远不要使用屏障，除非你确切知道在这种情况下为什么需要打破这条规则。特别是在这里，集合操作gather本身就会执行与屏障相同的同步，所以直接使用它即可。

现在，转向更难的部分。如果数据不能被均匀划分，您有几个选择。最简单但不一定最好的办法是填充数组，使其能够被均匀划分，哪怕只是为了这一次操作。

如果你可以安排它以便列数均匀分配，即使行数没有，那么你仍然可以使用gatherv并为行的每个部分创建一个向量类型，并收集每个处理器的适当行数。那会很好。

如果你确实遇到这样的情况：既不能指望行或列被均匀划分，又无法为了发送而填充数据，那么我能想到三个子选项：

正如susterpatt建议的那样，使用点对点通信。对于少量任务，这很好，但随着任务数变大，这将比集合操作效率低得多。
创建一个由所有不在外边缘的处理器组成的通信器，并使用上面的代码来收集它们的数据；然后用点对点通信传送边缘任务的数据。
根本不要收集到进程0；使用分布式数组类型（Distributed array type）来描述数组的布局，并使用MPI-IO将所有数据写入文件；完成后，如果您愿意，可以让进程0以某种方式显示这些数据。

Answer 2

看起来你调用MPI_Gather时的第一个参数应该是array[0]，而不是array。

此外，如果您需要从每个排名中获取不同数量的数据，最好使用MPI_Gatherv。

最后，请注意，把所有数据收集到一处再输出，在许多情况下是不可扩展的。随着数据量的增长，最终它将超过rank 0可用的内存。分散输出工作（如果您要写入文件，可使用MPI IO或其他库调用），或者逐个向rank 0做点对点发送以限制总内存消耗，可能会好得多。

另一方面，我不建议协调各个rank一个接一个地打印到标准输出，因为一些主要的MPI实现不保证标准输出会按顺序生成。特别是当多个rank同时打印时，Cray的MPI会把标准输出彻底打乱。

Answer 3

根据这份文档（强调是我加的）：

集合操作的类型匹配条件比点对点通信中发送方和接收方之间的相应条件更严格。即，对于集合操作，发送的数据量必须与接收方指定的数据量完全匹配。发送方和接收方之间仍然允许使用不同的类型映射（type map）。

听起来我有两个选择：

填充较小的子矩阵，以便所有进程发送相同数量的数据，然后在收集后将矩阵裁剪回其原始大小。如果您喜欢冒险，可以尝试定义接收类型图，以便在Gather操作期间自动覆盖填充，从而消除了之后的裁剪需求。这可能会有点复杂。
回归点对点沟通。更直接，但可能更高的沟通成本。

就个人而言，我会选择2。

将2D阵列的分布式块发送到MPI中的根进程

3 个答案: