MPI_ERR_TRUNCATE:消息被截断

时间:2016-06-01 10:48:29

标签: mpi openmpi

我在以下代码中遇到了"MPI_ERR_TRUNCATE:消息被截断"错误。这是一段测试代码,其中接收方进程从发送方接收两条消息:第一条消息告诉它第二条消息将包含多少个整数,第二条消息则包含这些整数本身。

int main(int argc, char* argv[]){
    MPI_Init(&argc,&argv);
    int rank,nodes;
    int i,j;
    MPI_Status stat;
    int size,wait;
    int msgs = atoi(argv[1]);
    MPI_Comm_size(MPI_COMM_WORLD, &nodes);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Request req1;
    MPI_Request*** req = (MPI_Request***)malloc(sizeof(MPI_Request**)*msgs);
    for(i=0;i<msgs;i++){
        req[i] = (MPI_Request**)malloc(sizeof(MPI_Request*)*(nodes-1));
        for(j=0;j<nodes-1;j++){
            req[i][j] = (MPI_Request*)malloc(sizeof(MPI_Request)*2);
        }
    }
    if(rank==0){
        int sent=0;
        int data[10];
        int pendingThreshold=100;
        int sentThreshold=1000;
        int completed=0;
        size=2;
        time_t t;
        srand((unsigned) time(&t));
        for(i=0;i<msgs;i++){
            for(j=1;j<nodes;j++){
                size=rand()%9+1;
                printf("Sending size = %d at i=%d\n",size,i);
                MPI_Isend(&size,1,MPI_INT,
                            j,0,MPI_COMM_WORLD,&(req[i][j-1][0]));
                MPI_Isend (&data[0],size,
                                MPI_INT,j,1,MPI_COMM_WORLD,&(req[i][j-1][1]));  
                //Code for ensuring number of non blocking operations 
                //do not exceed a certain threshold
                if(sent==sentThreshold){
                    while(sent>pendingThreshold){
                        int k=0;
                        wait=1;
                        while(wait){
                            MPI_Test(&req[completed][k][0], &wait, &stat);
                            wait = 1-wait;
                            if(!wait){
                                MPI_Test(&req[completed][k][1], &wait, &stat);
                                wait = 1-wait;
                                if(!wait){
                                    k++;
                                    if(k==nodes-1){
                                        completed++;
                                        sent--;
                                    }
                                    else{
                                        wait=1;
                                    }
                                }
                            }
                        }                           
                    }
                }
            }
            sent++;
        }
        //Code for ensuring all non blocking operations are complete
        wait = 1;
        printf("Finished\n");
        i=completed,j=0;
        while(wait){
            MPI_Test(&req[i][j][0], &wait, &stat);
            wait = 1-wait;
            if(!wait){
                MPI_Test(&req[i][j][1], &wait, &stat);
                wait = 1-wait;
                if(!wait){
                    j++;
                    if(j==nodes-1){
                        j=0;
                        i++;
                        wait=1;
                    }
                    else{
                        wait=1;
                    }
                    if(i==msgs){
                        wait=0;
                    }
                }
            }
        }
        printf("Finished\n");
    }
    else{
        int data[10];
        MPI_Request req2;
        for(i=0;i<msgs;i++){
            MPI_Irecv (&size,1,MPI_INT,0,0,MPI_COMM_WORLD,&req1);
            wait = 1;
            while(wait){
                MPI_Test(&req1, &wait, &stat);
                wait = 1-wait;
            }
            wait = 1;
            while(wait && i){
                MPI_Test(&req2, &wait, &stat);
                wait = 1-wait;
            }
            printf("Receiving size=%d at i=%d\n",size,i);
            MPI_Irecv (&data[0],size,MPI_INT,0,1,MPI_COMM_WORLD,&req2);
            size=0;
        }
        printf("Finished rank=%d\n",rank);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
}

该程序在多次成功的发送-接收之后报出MPI_ERR_TRUNCATE错误。当接收方进程在第一条消息中收到的整数个数有误(与第二条消息的实际长度不符)时,就会发生该错误。例如:

Sending size = 8 at i=1496
Sending size = 2 at i=1497
Sending size = 7 at i=1498
Sending size = 5 at i=1499
Sending size = 5 at i=1500
Sending size = 5 at i=1501
Sending size = 4 at i=1502
Sending size = 9 at i=1503
Sending size = 4 at i=1504
Receiving size=8 at i=1496
Receiving size=2 at i=1497
Receiving size=7 at i=1498
Receiving size=6 at i=1499
Receiving size=6 at i=1500
Receiving size=6 at i=1501
Receiving size=6 at i=1502
Receiving size=6 at i=1503

错误发生在第1499条消息处,此后接收到的所有消息的大小都相同(均为6)。我运行该代码时向接收方进程发送了100000条消息。

以下代码使用MPI_Iprobe,即使对于1000000条消息也能正常工作。

int main(int argc, char* argv[]){
    MPI_Init(&argc,&argv);
    int rank,nodes;
    int i,j;
    MPI_Status stat;
    int size,wait;
    int msgs = atoi(argv[1]);
    MPI_Comm_size(MPI_COMM_WORLD, &nodes);
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Request req1;
    MPI_Request** req = (MPI_Request**)malloc(sizeof(MPI_Request*)*msgs);
    for(i=0;i<msgs;i++){
        req[i] = (MPI_Request*)malloc(sizeof(MPI_Request)*(nodes-1));
    }
    if(rank==0){
        int sent=0;
        int data[10];
        int pendingThreshold=1000;
        int sentThreshold=10000;
        int completed=0;
        size=2;
        time_t t;
        srand((unsigned) time(&t));
        for(i=0;i<msgs;i++){
            for(j=1;j<nodes;j++){
                size=rand()%9+1;
                printf("Sending size = %d at i=%d\n",size,i);
                MPI_Isend (&data[0],size,
                                MPI_INT,j,1,MPI_COMM_WORLD,&(req[i][j-1]));     
                if(sent==sentThreshold){
                    while(sent>pendingThreshold){
                        int k=0;
                        wait=1;
                        while(wait){
                            MPI_Test(&req[completed][k], &wait, &stat);
                            wait = 1-wait;
                            if(!wait){
                                k++;
                                if(k==nodes-1){
                                    completed++;
                                    sent--;
                                }
                                else{
                                    wait=1;
                                }
                            }
                        }                       
                    }
                }
            }
            sent++;
        }
        wait = 1;
        printf("Finished\n");
        i=completed,j=0;
        while(wait){
            MPI_Test(&req[i][j], &wait, &stat);
            wait = 1-wait;
            if(!wait){
                j++;
                if(j==nodes-1){
                    j=0;
                    i++;
                    wait=1;
                }
                else{
                    wait=1;
                }
                if(i==msgs){
                    wait=0;
                }
            }
        }
        printf("Finished\n");
    }
    else{
        int data[10];
        MPI_Request req2;
        for(i=0;i<msgs;i++){
            wait = 1;
            while(wait){
                MPI_Iprobe(0,1,MPI_COMM_WORLD,&wait,&stat);
                wait = 1-wait;
            }
            MPI_Get_count(&stat,MPI_INT,&size);
            printf("Receiving size=%d at i=%d\n",size,i);
            MPI_Irecv (&data[0],size,MPI_INT,0,1,MPI_COMM_WORLD,&req2);
            size=0;
        }
        printf("Finished rank=%d\n",rank);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Finalize();
}

0 个答案:

没有答案