Using MPI, I'm trying to distribute a 2D array of arbitrary size as evenly as possible over an arbitrary number of processes. For example, suppose we have a 10x10 array and 9 processes. What I want to achieve is the following:
 _______________________
|        |      |      |
|   P0   |  P1  |  P2  |
|  4x4   | 4x3  | 4x3  |
|________|______|______|
|   P3   |  P4  |  P5  |
|  3x4   | 3x3  | 3x3  |
|________|______|______|
|   P6   |  P7  |  P8  |
|  3x4   | 3x3  | 3x3  |
|________|______|______|
So far I've managed to create a suitable process grid and to compute the correct displacements into the global array (the first element of each subarray), but I'm failing badly when it comes to actually sending those subarrays. I've tried using both MPI_Type_vector and MPI_Type_create_subarray, but I must be missing something important, because I can't get MPI_Scatterv to work.
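For reference, this is roughly the pattern I am trying to generalize, written out for the uniform case only (a hypothetical 2x2 process grid on the same 10x10 array, so every block is 5x5): as far as I understand it, a single resized MPI_Type_vector describes one block, every count is 1, and the displacements are measured in chars.

#include <stdio.h>
#include <mpi.h>

#define ROWS 10
#define COLS 10
#define NPR  2   // process-grid rows for this sketch
#define NPC  2   // process-grid cols for this sketch

int main(int argc, char **argv) {
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != NPR*NPC) {
        if (rank == 0) fprintf(stderr, "run with exactly %d processes\n", NPR*NPC);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    const int br = ROWS/NPR, bc = COLS/NPC;   // uniform block size: 5x5
    char global[ROWS*COLS];
    char local[br*bc];
    if (rank == 0)
        for (int i = 0; i < ROWS*COLS; i++) global[i] = (char)i;

    // One block = br rows of bc chars, consecutive rows COLS chars apart in the
    // global array; resize the extent to one char so displacements count chars.
    MPI_Datatype tmp, blocktype;
    MPI_Type_vector(br, bc, COLS, MPI_CHAR, &tmp);
    MPI_Type_create_resized(tmp, 0, sizeof(char), &blocktype);
    MPI_Type_commit(&blocktype);
    MPI_Type_free(&tmp);

    int counts[NPR*NPC], disps[NPR*NPC];
    for (int i = 0; i < NPR; i++)
        for (int j = 0; j < NPC; j++) {
            counts[i*NPC + j] = 1;                  // exactly one block per process
            disps[i*NPC + j]  = i*br*COLS + j*bc;   // offset of the block's first element
        }

    MPI_Scatterv(global, counts, disps, blocktype,
                 local, br*bc, MPI_CHAR,            // receive side: plain contiguous buffer
                 0, MPI_COMM_WORLD);

    printf("P%d received elements %d..%d\n", rank, (int)local[0], (int)local[br*bc - 1]);

    MPI_Type_free(&blocktype);
    MPI_Finalize();
    return 0;
}

This works because every block has the same shape, so one send-side datatype fits all of them; the part I am unsure about is what that datatype should look like once the blocks have different shapes, which is what my code attempts.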
Here is my code so far; keep in mind that I need some nasty nested loops for the 2D array handling:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

// This will be command-line arguments
#define COLS 10
#define ROWS 10

int main(int argc, char **argv) {
    int p, rank, i, j, proc;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Let MPI decide the topology
    int dims[2] = {0, 0};
    MPI_Dims_create(p, 2, dims);
    int periods[2] = {0, 0};     // non-periodic topology
    int my_coords[2];
    MPI_Comm comm_2D;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm_2D);
    MPI_Cart_coords(comm_2D, rank, 2, my_coords);

    // Prepare the arrays and necessary info
    char global_matrix[ROWS*COLS];
    const int NPROWS = dims[0];  // Number of 'block' rows
    const int NPCOLS = dims[1];  // Number of 'block' cols
    int *num_rows;               // Array containing the number of rows for the i-th process
    int *num_cols;               // As before
    num_rows = (int *) malloc(p * sizeof(int));
    num_cols = (int *) malloc(p * sizeof(int));

    if (rank == 0) {
        // Fill global matrix
        for (i=0; i<ROWS*COLS; i++) {
            global_matrix[i] = (char)i;
        }
        // Calculate the number of rows/cols for each process
        for (i=0; i<p; i++) {
            num_rows[i] = ROWS/NPROWS;
            num_cols[i] = COLS/NPCOLS;
        }
        for (i=0; i<(ROWS%NPROWS); i++) {
            for (j=0; j<NPCOLS; j++) {
                num_rows[i*NPCOLS+j]++;
            }
        }
        for (i=0; i<(COLS%NPCOLS); i++) {
            for (j=0; j<NPROWS; j++) {
                num_cols[i+NPROWS*j]++;
            }
        }
    }

    // Inform each process about its local matrix size
    MPI_Bcast(num_rows, p, MPI_INT, 0, comm_2D);
    MPI_Bcast(num_cols, p, MPI_INT, 0, comm_2D);

    // Define and initialize each local matrix
    char local_matrix[num_rows[rank]*num_cols[rank]];
    for (i=0; i<num_rows[rank]*num_cols[rank]; i++) {
        local_matrix[i] = 0;
    }

    // Preparing for the Scatterv. Calculate displacements and number
    // of elements to send to each process
    int *disps = NULL;
    int *counts = NULL;
    if (rank == 0) {
        disps = (int *) malloc(p * sizeof(int));
        counts = (int *) malloc(p * sizeof(int));
        for (i=0; i<NPROWS; i++) {
            for (j=0; j<NPCOLS; j++) {
                if (j == 0) {
                    // First block of the 'blockrow'
                    disps[i*NPCOLS+j] = i*COLS*num_rows[i*NPCOLS+j] + j*num_cols[i*NPCOLS+j];
                } else {
                    // Rest of the blocks
                    disps[i*NPCOLS+j] = disps[i*NPCOLS+j - 1] + num_cols[i*NPCOLS+j - 1];
                }
                // This is VERY important and I'm not sure about it.
                counts[i*NPCOLS+j] = 1; // 1 element to each process??
            }
        }
    }

    // Preparing the Datatypes for the Scatterv operation
    MPI_Datatype tmp_matrix_t, global_matrix_t, local_matrix_t;
    MPI_Type_vector(ROWS, 1, COLS, MPI_CHAR, &tmp_matrix_t);
    MPI_Type_create_resized(tmp_matrix_t, 0, sizeof(char), &global_matrix_t);
    MPI_Type_commit(&global_matrix_t);
    MPI_Type_free(&tmp_matrix_t);
    MPI_Type_vector(num_rows[rank], 1, num_cols[rank], MPI_CHAR, &tmp_matrix_t);
    MPI_Type_create_resized(tmp_matrix_t, 0, sizeof(char), &local_matrix_t);
    MPI_Type_commit(&local_matrix_t);
    MPI_Type_free(&tmp_matrix_t);

    // Doesn't work as expected
    MPI_Scatterv(global_matrix, counts, disps, global_matrix_t,
                 local_matrix, 1, local_matrix_t,   // receiving 1 element??
                 0, comm_2D);

    // Testing/printing results
    MPI_Barrier(comm_2D);
    for (proc=0; proc<p; proc++) {
        if (proc == rank) {
            if (rank == 0) {
                printf("Global matrix:\n");
                for (i=0; i<ROWS; i++) {
                    printf("G: ");
                    for (j=0; j<COLS; j++) {
                        printf("%3d ", (int)global_matrix[i*COLS+j]);
                    }
                    printf("\n");
                }
            }
            printf("Local matrix P%d:\n", rank);
            for (i=0; i<num_rows[rank]; i++) {
                printf("L%d: ", rank);
                for (j=0; j<num_cols[rank]; j++) {
                    printf("%3d ", (int)local_matrix[i*num_cols[rank]+j]);
                }
                printf("\n");
            }
        }
    }

    MPI_Finalize();
    return 0;
}
EDIT:
Looking again at the code in this answer by @Jonathan Dursi, https://stackoverflow.com/a/7587133/4573730, I'm starting to think that the result I want cannot be obtained with a single MPI_Scatterv alone. Right now I think the best approach is probably either to fall back on point-to-point communication, or to use MPI_Scatterv with four different communicators (since there are at most four different subarray sizes), so that within each communicator the send/receive buffers all have the same size.
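A rough, untested sketch of the point-to-point idea is below (the num_rows/num_cols bookkeeping mirrors my code above, using the same kind of uneven split where the first remainder block-rows/columns get one extra row/column): rank 0 builds one MPI_Type_vector per destination that matches that process's block inside the global array and sends it, while every receiver takes its block as a plain contiguous buffer.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <mpi.h>

#define ROWS 10
#define COLS 10

int main(int argc, char **argv) {
    int p, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &p);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int dims[2] = {0, 0};
    MPI_Dims_create(p, 2, dims);
    const int NPROWS = dims[0], NPCOLS = dims[1];

    // Every rank computes the uneven split locally, so no broadcast is needed:
    // the first ROWS%NPROWS block-rows (COLS%NPCOLS block-cols) get one extra line.
    int *num_rows = malloc(p * sizeof(int));
    int *num_cols = malloc(p * sizeof(int));
    for (int i = 0; i < NPROWS; i++)
        for (int j = 0; j < NPCOLS; j++) {
            num_rows[i*NPCOLS + j] = ROWS/NPROWS + (i < ROWS%NPROWS ? 1 : 0);
            num_cols[i*NPCOLS + j] = COLS/NPCOLS + (j < COLS%NPCOLS ? 1 : 0);
        }

    char global[ROWS*COLS];
    char *local = malloc(num_rows[rank] * num_cols[rank]);

    if (rank == 0) {
        for (int i = 0; i < ROWS*COLS; i++) global[i] = (char)i;

        int row0 = 0;
        for (int i = 0; i < NPROWS; i++) {
            int col0 = 0;
            for (int j = 0; j < NPCOLS; j++) {
                int dest = i*NPCOLS + j;
                if (dest == 0) {
                    // Root keeps its own block: copy it out row by row
                    for (int r = 0; r < num_rows[0]; r++)
                        memcpy(local + r*num_cols[0], global + r*COLS, num_cols[0]);
                } else {
                    // A datatype matching exactly this block inside the global array
                    MPI_Datatype block;
                    MPI_Type_vector(num_rows[dest], num_cols[dest], COLS, MPI_CHAR, &block);
                    MPI_Type_commit(&block);
                    MPI_Send(global + row0*COLS + col0, 1, block, dest, 0, MPI_COMM_WORLD);
                    MPI_Type_free(&block);
                }
                col0 += num_cols[dest];
            }
            row0 += num_rows[i*NPCOLS];
        }
    } else {
        // The block arrives as a plain contiguous buffer
        MPI_Recv(local, num_rows[rank]*num_cols[rank], MPI_CHAR, 0, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }

    printf("P%d has a %dx%d block starting with %d\n",
           rank, num_rows[rank], num_cols[rank], (int)local[0]);

    free(local); free(num_rows); free(num_cols);
    MPI_Finalize();
    return 0;
}

Compared with the four-communicator idea this keeps a single communicator at the cost of p-1 sends from the root, which seems acceptable for the array sizes I'm dealing with, but I'd still prefer a collective solution if one exists.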