mpi scatterv和gatherv

时间:2018-04-04 07:05:10

标签: mpi

当我使用命令#34运行我的代码时; mpirun -hosts o251-12,o251-13 ./matrixmult"我键入ijk R 4,它给出了错误。更糟糕的是,如果我再次运行,错误有时会改变。 如果我使用1或2而不是4,它运行良好,但如果我使用大数字,它会给出错误。我试过改变我的代码,但是没有用。

这是我的代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <mpi.h>
#include <string.h>
void putValue(int *A, int *B, char *flag, int n);
void ijk(int *A, int *B, int *result, int n, int tmpAn);
void ikj(int *A, int *B, int *result, int n, int tmpAn);
void kij(int *A, int *B, int *result, int n, int tmpAn);

int main(){
int *A;
int *B;
int *result;
int n;
char flag[2];
char form[4];

int my_rank;
int comm_sz;

double time1;
double time2;
double time;

int i;
int j;

int *tmpA;
int *tmpResult;
int *sc;
int *displs;
int alpha;
int d;

MPI_Init(NULL,NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD,&comm_sz);

MPI_Barrier(MPI_COMM_WORLD);
if(my_rank==0){
    time1=MPI_Wtime();
}

if(my_rank==0){
    scanf("%s", form);
    scanf("%s", flag);
    scanf("%d", &n);

    A = (int*)malloc(n * n * sizeof(int));
    B = (int*)malloc(n * n * sizeof(int));
    result = (int*)malloc(n * n * sizeof(int));

    putValue(A,B,flag,n);

    printf("running on %d processors\n", comm_sz);
}
if(my_rank){
    B = (int*)malloc(n * n * sizeof(int));
}

MPI_Bcast(&n, 1, MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(B,n*n,MPI_INT,0,MPI_COMM_WORLD);
MPI_Bcast(&form, 4, MPI_CHAR,0,MPI_COMM_WORLD);

sc = (int*)malloc(comm_sz * sizeof(int));
displs = (int*)malloc(comm_sz * sizeof(int));
alpha = n%comm_sz;
d=0;
for(i=0;i<comm_sz;i++){
    sc[i] = n/comm_sz;
    if(i<alpha){
        sc[i] = sc[i]+1;
    }
    sc[i]=sc[i]*n;
    displs[i] = d;
    d+=sc[i];
}

tmpA = (int*)malloc(n * sc[my_rank] * sizeof(int));
tmpResult = (int*)malloc(n * sc[my_rank] * sizeof(int));
for(i=0; i<sc[my_rank]; i++){
    tmpResult[i]=0;
}

MPI_Scatterv(A,sc,displs,MPI_INT,tmpA,sc[my_rank],MPI_INT,0,MPI_COMM_WORLD);
if(strcmp(form,"ijk")==0){
    ijk(tmpA,B,tmpResult,n,sc[my_rank]);
}
else if(strcmp(form,"ikj")==0){
    ikj(tmpA,B,tmpResult,n,sc[my_rank]);
}
else if(strcmp(form,"kij")==0){
    kij(tmpA,B,tmpResult,n,sc[my_rank]);
}
MPI_Gatherv(tmpResult,sc[my_rank],MPI_INT,result,sc,displs,MPI_INT,0,MPI_COMM_WORLD);

if(my_rank==0){
    time2=MPI_Wtime();
    time=time2-time1;
    printf("elapsed time = %.6e secondes\n",time);
    if(strcmp(flag,"I")==0){
        for(i=0;i<n;i++){
            for(j=0;j<n;j++){
                printf("%d ",result[i*n+j]);
            }
            printf("\n");
        }
    }
}
MPI_Finalize();
return 0;
}

void putValue(int *A, int *B, char *flag, int n){
int i,j;
srand((unsigned)time(NULL));
if(strcmp(flag,"R")==0){
    for(i=0; i<n; i++){
        for(j=0; j<n; j++){
            A[i*n+j] = (int)rand()%101;
            B[i*n+j] = (int)rand()%101;
            printf("%d\n",A[i*n+j]);
            printf("%d\n",B[i*n+j]);
        }
    }
}
else if(strcmp(flag,"I")==0){
    for(i=0; i<n; i++){
        for(j=0; j<n; j++){
            int x;
            scanf("%d", &x);
            A[i*n+j]=x;
        }
    }
    for(i=0; i<n; i++){
        for(j=0; j<n; j++){
            int x;
            scanf("%d", &x);
            B[i*n+j]=x;
        }
    }
}
}

void ijk(int *A, int *B, int *result, int n, int tmpAn){
int i,j,k;
for(i=0;i<(tmpAn/n);i++){
    for(j=0;j<n;j++){
        for(k=0;k<n;k++){
            result[i*n+j] += (A[i*n+k] * B[k*n+j]);
        }
    }
}
}

void ikj(int *A, int *B, int *result, int n, int tmpAn){
int i,j,k;
for(i=0;i<(tmpAn/n);i++){
    for(k=0;k<n;k++){
        for(j=0;j<n;j++){
            result[i*n+j] += (A[i*n+k] * B[k*n+j]);
        }
    }
}
}

void kij(int *A, int *B, int *result, int n, int tmpAn){
int i,j,k;
for(k=0;k<n;k++){
    for(i=0;i<(tmpAn/n);i++){
        for(j=0;j<n;j++){
            result[i*n+j] += (A[i*n+k] * B[k*n+j]);
        }
    }
}
}

有时错误是

rank = 3, revents = 25, state = 8
Assertion failed in file ../../src/mpid/ch3/channels/nemesis/netmod/tcp/socksm.c at line 2988: (it_plfd->revents & POLLERR) == 0
internal ABORT - process 1

有时错误是

Fatal error in PMPI_Gatherv: Unknown error class, error stack:
PMPI_Gatherv(1001)....................: MPI_Gatherv failed(sbuf=0x231f140, 
scount=4, MPI_INT, rbuf=0x231f060, rcnts=0x231f0b0, displs=0x231f0d0, MPI_INT, root=0, MPI_COMM_WORLD) failed
MPIR_Gatherv_impl(545)................: fail failed
I_MPIR_Gatherv_intra(617).............: Failure during collective
I_MPIR_Gatherv_intra(590).............: fail failed
MPIR_Gatherv_advanced(720)............: fail failed
MPIDU_Complete_posted_with_error(1710): Process failed
====================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   PID 17870 RUNNING AT o251-13
=   EXIT CODE: 134
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES

1 个答案:

答案 0 :(得分:0)

此行由根进程

执行
scanf("%d", &n);

此行由广播n之前的其余部分执行:

B = (int*)malloc(n * n * sizeof(int));

在广播之后移动那个alloc语句