I have a 2D matrix; suppose it is the following 4x6 matrix:
1 2 3 4 5 6
7 8 9 10 11 12
13 14 15 16 17 18
19 20 21 22 23 24
I would like each of the 4 processes to get one 2x3 submatrix and place it into a 4x5 buffer.
Something like this:
0 0 0 0 0
0 1 2 3 0
0 7 8 9 0
0 0 0 0 0
0 0 0 0 0
0 4 5 6 0
0 10 11 12 0
0 0 0 0 0
0 0 0 0 0
0 13 14 15 0
0 19 20 21 0
0 0 0 0 0
0 0 0 0 0
0 16 17 18 0
0 22 23 24 0
0 0 0 0 0
The extra rows are not a problem, because I can call MPI_Scatterv and point it at the second row of each process's buffer, but the extra columns make things more complicated. Can this be done using only MPI datatypes and a single MPI_Scatterv call? If so, please give me some pointers.
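To illustrate the row case: if the local buffer had extra rows but no extra columns, each received block would still be contiguous, so I could simply point the receive buffer at the start of its second row. A rough sketch (illustrative only; sendtype, counts and displs stand for a suitably set-up send side):

/* Extra rows only: blockRows+2 rows of exactly blockColumns floats each.
   Skipping the first (halo) row is just a pointer offset, so the receive
   side can stay a plain run of MPI_FLOATs. */
float *local = malloc((blockRows + 2) * blockColumns * sizeof(float));

MPI_Scatterv(global, counts, displs, sendtype,
             &local[blockColumns],                /* second row of the buffer */
             blockRows * blockColumns, MPI_FLOAT,
             MASTER, commCart);

With extra columns this no longer works, because the padding on the left and right means the rows of the received block are separated by halo elements, so the target region is not contiguous. Here is my current attempt: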
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

dim[0] = dim[1] = sqrt(numtasks);
periods[0] = periods[1] = 0;
MPI_Cart_create(MPI_COMM_WORLD, 2, dim, periods, 1, &commCart);
MPI_Comm_rank(commCart, &taskid);

NPROWS = dim[0];
NPCOLS = dim[1];
blockRows = ROWS / NPROWS;
blockColumns = COLS / NPCOLS;

if (taskid == MASTER) {
    for (i = 0; i < ROWS*COLS; i++) {
        global[i] = i;
    }
}

float* local;
local = malloc(blockRows * (blockColumns+2) * sizeof(float));
for (i = 0; i < blockRows * (blockColumns+2); i++) {
    local[0][i] = 0;
    local[1][i] = 0;
}

MPI_Datatype type, resizedtype, column;
int sizes[2]    = {ROWS, COLS};
int subsizes[2] = {blockRows, blockColumns};
int starts[2]   = {0, 0};
MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &type);
MPI_Type_create_resized(type, 0, blockColumns*sizeof(float), &resizedtype);
MPI_Type_commit(&resizedtype);

int *counts = malloc(numworkers*sizeof(int));
int *displs = malloc(numworkers*sizeof(int));
for (i = 0; i < numworkers; i++) {
    counts[i] = 1;
}
int disp = 0;
for (i = 0; i < NPROWS; i++) {
    for (j = 0; j < NPCOLS; j++) {
        displs[i*dim[0] + j] = disp;
        disp++;
    }
    disp += (blockColumns-1)*dim[0];
}

MPI_Scatterv(global, counts, displs, resizedtype,
             &local[0][blockColumns], blockRows*blockColumns, MPI_FLOAT,
             MASTER, commCart);
Answer (score: 0)

I have had to guess how you actually want the code to work (for example, the way local is currently defined and then used first as a 1D and then as a 2D array is not internally consistent). The key points are that the send side describes one blockRows x blockColumns block of the global ROWS x COLS matrix with a subarray type whose extent is resized to a single float, so the displacements can be given as the element offset of each block's top-left corner, and that the receive side uses a second subarray type (localtype) which places the incoming block into the interior of the padded (blockRows+2) x (blockColumns+2) local buffer, so each rank receives one localtype rather than a contiguous run of floats.

I have not checked the code in general, but it seems to work for the particular case you describe, i.e. a 4x6 matrix with a 2x2 decomposition.
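To make the displacement arithmetic concrete: because the send type's extent is resized to sizeof(float), the displacement for each block is simply the element offset of its top-left corner in the global matrix, which for the 4x6 matrix on a 2x2 grid works out to 0, 3, 12 and 15. A standalone sketch of the same calculation (values hard-coded for this example; the loop mirrors the one in the full program below):

#include <stdio.h>

int main(void)
{
    /* 4x6 global matrix split into 2x3 blocks on a 2x2 process grid */
    int ROWS = 4, COLS = 6, NPROWS = 2, NPCOLS = 2;
    int blockRows = ROWS / NPROWS;        /* = 2 */
    int blockColumns = COLS / NPCOLS;     /* = 3 */

    for (int i = 0; i < NPROWS; i++) {
        for (int j = 0; j < NPCOLS; j++) {
            /* element offset of the block's top-left corner */
            int disp = i*blockRows*COLS + j*blockColumns;
            printf("block (%d,%d): displacement %d\n", i, j, disp);
        }
    }
    return 0;   /* prints displacements 0, 3, 12, 15 */
}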
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <mpi.h>

#define MASTER 0
#define ROWS 4
#define COLS 6

int main(void)
{
    int dim[2], periods[2], NPROWS, NPCOLS, blockRows, blockColumns;
    int numtasks, taskid, i, j;
    MPI_Comm commCart;

    MPI_Init(NULL, NULL);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);

    /* Square process grid, e.g. 2x2 for 4 processes */
    dim[0] = dim[1] = sqrt(numtasks);
    periods[0] = periods[1] = 0;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim, periods, 1, &commCart);
    MPI_Comm_rank(commCart, &taskid);

    NPROWS = dim[0];
    NPCOLS = dim[1];
    blockRows = ROWS / NPROWS;
    blockColumns = COLS / NPCOLS;

    /* The global matrix only exists on the master */
    float *global = NULL;
    if (taskid == MASTER) {
        global = malloc(ROWS * COLS * sizeof(float));
        for (i = 0; i < ROWS*COLS; i++) {
            global[i] = i + 1;
        }
    }

    /* Local buffer: the block plus a one-element halo on every side, zeroed */
    float *local = malloc((blockRows+2) * (blockColumns+2) * sizeof(float));
    for (i = 0; i < (blockRows+2) * (blockColumns+2); i++) {
        local[i] = 0;
    }

    /* Send side: a blockRows x blockColumns subarray of the global matrix,
       resized so that its extent is a single float; the displacements can
       then be given as plain element offsets of each block's top-left corner */
    MPI_Datatype type, resizedtype;
    int sizes[2]    = {ROWS, COLS};
    int subsizes[2] = {blockRows, blockColumns};
    int starts[2]   = {0, 0};
    MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_FLOAT, &type);
    MPI_Type_create_resized(type, 0, sizeof(float), &resizedtype);
    MPI_Type_commit(&resizedtype);

    int *counts = malloc(numtasks*sizeof(int));
    int *displs = malloc(numtasks*sizeof(int));
    for (i = 0; i < numtasks; i++) {
        counts[i] = 1;
    }
    for (i = 0; i < NPROWS; i++) {
        for (j = 0; j < NPCOLS; j++) {
            displs[i*dim[0] + j] = i*blockRows*COLS + j*blockColumns;
        }
    }

    /* Receive side: a second subarray type that drops the incoming block
       into the interior of the padded local buffer */
    MPI_Datatype localtype;
    int localsizes[2]    = {blockRows+2, blockColumns+2};
    int localsubsizes[2] = {blockRows, blockColumns};
    int localstarts[2]   = {1, 1};
    MPI_Type_create_subarray(2, localsizes, localsubsizes, localstarts, MPI_ORDER_C, MPI_FLOAT, &localtype);
    MPI_Type_commit(&localtype);

    MPI_Scatterv(global, counts, displs, resizedtype,
                 &local[0], 1, localtype,
                 MASTER, commCart);

    for (i = 0; i < (blockRows+2)*(blockColumns+2); i++) {
        printf("rank %d: local[%d] = %f\n", taskid, i, local[i]);
    }

    MPI_Finalize();
    return 0;
}
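For completeness, the derived datatypes and heap buffers would normally be released before MPI_Finalize; the program above omits this, but the extra lines would look roughly like:

MPI_Type_free(&type);
MPI_Type_free(&resizedtype);
MPI_Type_free(&localtype);
free(local);
free(counts);
free(displs);
if (taskid == MASTER) free(global);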
Here is the output; it looks like what you wanted:
mpiexec -n 4 ./scatterv
rank 0: local[0] = 0.000000
rank 0: local[1] = 0.000000
rank 0: local[2] = 0.000000
rank 0: local[3] = 0.000000
rank 0: local[4] = 0.000000
rank 0: local[5] = 0.000000
rank 0: local[6] = 1.000000
rank 0: local[7] = 2.000000
rank 0: local[8] = 3.000000
rank 0: local[9] = 0.000000
rank 0: local[10] = 0.000000
rank 0: local[11] = 7.000000
rank 0: local[12] = 8.000000
rank 0: local[13] = 9.000000
rank 0: local[14] = 0.000000
rank 0: local[15] = 0.000000
rank 0: local[16] = 0.000000
rank 0: local[17] = 0.000000
rank 0: local[18] = 0.000000
rank 0: local[19] = 0.000000
rank 1: local[0] = 0.000000
rank 1: local[1] = 0.000000
rank 1: local[2] = 0.000000
rank 1: local[3] = 0.000000
rank 1: local[4] = 0.000000
rank 1: local[5] = 0.000000
rank 1: local[6] = 4.000000
rank 1: local[7] = 5.000000
rank 1: local[8] = 6.000000
rank 1: local[9] = 0.000000
rank 1: local[10] = 0.000000
rank 1: local[11] = 10.000000
rank 1: local[12] = 11.000000
rank 1: local[13] = 12.000000
rank 1: local[14] = 0.000000
rank 1: local[15] = 0.000000
rank 1: local[16] = 0.000000
rank 1: local[17] = 0.000000
rank 1: local[18] = 0.000000
rank 1: local[19] = 0.000000
rank 2: local[0] = 0.000000
rank 2: local[1] = 0.000000
rank 2: local[2] = 0.000000
rank 2: local[3] = 0.000000
rank 2: local[4] = 0.000000
rank 2: local[5] = 0.000000
rank 2: local[6] = 13.000000
rank 2: local[7] = 14.000000
rank 2: local[8] = 15.000000
rank 2: local[9] = 0.000000
rank 2: local[10] = 0.000000
rank 2: local[11] = 19.000000
rank 2: local[12] = 20.000000
rank 2: local[13] = 21.000000
rank 2: local[14] = 0.000000
rank 2: local[15] = 0.000000
rank 2: local[16] = 0.000000
rank 2: local[17] = 0.000000
rank 2: local[18] = 0.000000
rank 2: local[19] = 0.000000
rank 3: local[0] = 0.000000
rank 3: local[1] = 0.000000
rank 3: local[2] = 0.000000
rank 3: local[3] = 0.000000
rank 3: local[4] = 0.000000
rank 3: local[5] = 0.000000
rank 3: local[6] = 16.000000
rank 3: local[7] = 17.000000
rank 3: local[8] = 18.000000
rank 3: local[9] = 0.000000
rank 3: local[10] = 0.000000
rank 3: local[11] = 22.000000
rank 3: local[12] = 23.000000
rank 3: local[13] = 24.000000
rank 3: local[14] = 0.000000
rank 3: local[15] = 0.000000
rank 3: local[16] = 0.000000
rank 3: local[17] = 0.000000
rank 3: local[18] = 0.000000
rank 3: local[19] = 0.000000
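If it is easier to compare against the padded layout shown in the question, the flat printf loop could also be replaced by a helper that prints each rank's buffer row by row. A minimal sketch (print_local is a hypothetical helper, not part of the program above):

#include <stdio.h>

/* Print one rank's padded (blockRows+2) x (blockColumns+2) buffer as a grid,
   so the halo of zeros around the received block is easy to see. */
void print_local(int taskid, const float *local, int blockRows, int blockColumns)
{
    for (int i = 0; i < blockRows + 2; i++) {
        printf("rank %d:", taskid);
        for (int j = 0; j < blockColumns + 2; j++) {
            printf(" %6.1f", local[i*(blockColumns+2) + j]);
        }
        printf("\n");
    }
}

Called as print_local(taskid, local, blockRows, blockColumns) in place of the flat loop, rank 0 would then print the 4x5 grid from the question (zeros surrounding 1 2 3 / 7 8 9).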