我在以下代码中收到MPI_ERR_TRUNCATE:消息截断错误。这是一个测试代码,其中接收方进程从发送方接收两条消息。在第一条消息中,它接收它在第二条消息中接收的整数数的值。在第二条消息中,它接收这些整数。
int main(int argc, char* argv[]){
MPI_Init(&argc,&argv);
int rank,nodes;
int i,j;
MPI_Status stat;
int size,wait;
int msgs = atoi(argv[1]);
MPI_Comm_size(MPI_COMM_WORLD, &nodes);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Request req1;
MPI_Request*** req = (MPI_Request***)malloc(sizeof(MPI_Request**)*msgs);
for(i=0;i<msgs;i++){
req[i] = (MPI_Request**)malloc(sizeof(MPI_Request*)*(nodes-1));
for(j=0;j<nodes-1;j++){
req[i][j] = (MPI_Request*)malloc(sizeof(MPI_Request)*2);
}
}
if(rank==0){
int sent=0;
int data[10];
int pendingThreshold=100;
int sentThreshold=1000;
int completed=0;
size=2;
time_t t;
srand((unsigned) time(&t));
for(i=0;i<msgs;i++){
for(j=1;j<nodes;j++){
size=rand()%9+1;
printf("Sending size = %d at i=%d\n",size,i);
MPI_Isend(&size,1,MPI_INT,
j,0,MPI_COMM_WORLD,&(req[i][j-1][0]));
MPI_Isend (&data[0],size,
MPI_INT,j,1,MPI_COMM_WORLD,&(req[i][j-1][1]));
//Code for ensuring number of non blocking operations
//do not exceed a certain threshold
if(sent==sentThreshold){
while(sent>pendingThreshold){
int k=0;
wait=1;
while(wait){
MPI_Test(&req[completed][k][0], &wait, &stat);
wait = 1-wait;
if(!wait){
MPI_Test(&req[completed][k][1], &wait, &stat);
wait = 1-wait;
if(!wait){
k++;
if(k==nodes-1){
completed++;
sent--;
}
else{
wait=1;
}
}
}
}
}
}
}
sent++;
}
//Code for ensuring all non blocking operations are complete
wait = 1;
printf("Finished\n");
i=completed,j=0;
while(wait){
MPI_Test(&req[i][j][0], &wait, &stat);
wait = 1-wait;
if(!wait){
MPI_Test(&req[i][j][1], &wait, &stat);
wait = 1-wait;
if(!wait){
j++;
if(j==nodes-1){
j=0;
i++;
wait=1;
}
else{
wait=1;
}
if(i==msgs){
wait=0;
}
}
}
}
printf("Finished\n");
}
else{
int data[10];
MPI_Request req2;
for(i=0;i<msgs;i++){
MPI_Irecv (&size,1,MPI_INT,0,0,MPI_COMM_WORLD,&req1);
wait = 1;
while(wait){
MPI_Test(&req1, &wait, &stat);
wait = 1-wait;
}
wait = 1;
while(wait && i){
MPI_Test(&req2, &wait, &stat);
wait = 1-wait;
}
printf("Receiving size=%d at i=%d\n",size,i);
MPI_Irecv (&data[0],size,MPI_INT,0,1,MPI_COMM_WORLD,&req2);
size=0;
}
printf("Finished rank=%d\n",rank);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
}
该程序在多次成功发送 - 接收后接收MPI_ERR_TRUNCATE。当接收器进程收到要在其第二条消息中接收的错误大小的整数时,会发生错误。例如: -
Sending size = 8 at i=1496
Sending size = 2 at i=1497
Sending size = 7 at i=1498
Sending size = 5 at i=1499
Sending size = 5 at i=1500
Sending size = 5 at i=1501
Sending size = 4 at i=1502
Sending size = 9 at i=1503
Sending size = 4 at i=1504
Receiving size=8 at i=1496
Receiving size=2 at i=1497
Receiving size=7 at i=1498
Receiving size=6 at i=1499
Receiving size=6 at i=1500
Receiving size=6 at i=1501
Receiving size=6 at i=1502
Receiving size=6 at i=1503
错误发生在消息号1499处,之后接收的其他消息的大小相同。我已经运行了我的代码,将100000条消息发送到接收方进程。
以下代码使用MPI_Iprobe,即使对于1000000条消息也能正常工作。
int main(int argc, char* argv[]){
MPI_Init(&argc,&argv);
int rank,nodes;
int i,j;
MPI_Status stat;
int size,wait;
int msgs = atoi(argv[1]);
MPI_Comm_size(MPI_COMM_WORLD, &nodes);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
MPI_Request req1;
MPI_Request** req = (MPI_Request**)malloc(sizeof(MPI_Request*)*msgs);
for(i=0;i<msgs;i++){
req[i] = (MPI_Request*)malloc(sizeof(MPI_Request)*(nodes-1));
}
if(rank==0){
int sent=0;
int data[10];
int pendingThreshold=1000;
int sentThreshold=10000;
int completed=0;
size=2;
time_t t;
srand((unsigned) time(&t));
for(i=0;i<msgs;i++){
for(j=1;j<nodes;j++){
size=rand()%9+1;
printf("Sending size = %d at i=%d\n",size,i);
MPI_Isend (&data[0],size,
MPI_INT,j,1,MPI_COMM_WORLD,&(req[i][j-1]));
if(sent==sentThreshold){
while(sent>pendingThreshold){
int k=0;
wait=1;
while(wait){
MPI_Test(&req[completed][k], &wait, &stat);
wait = 1-wait;
if(!wait){
k++;
if(k==nodes-1){
completed++;
sent--;
}
else{
wait=1;
}
}
}
}
}
}
sent++;
}
wait = 1;
printf("Finished\n");
i=completed,j=0;
while(wait){
MPI_Test(&req[i][j], &wait, &stat);
wait = 1-wait;
if(!wait){
j++;
if(j==nodes-1){
j=0;
i++;
wait=1;
}
else{
wait=1;
}
if(i==msgs){
wait=0;
}
}
}
printf("Finished\n");
}
else{
int data[10];
MPI_Request req2;
for(i=0;i<msgs;i++){
wait = 1;
while(wait){
MPI_Iprobe(0,1,MPI_COMM_WORLD,&wait,&stat);
wait = 1-wait;
}
MPI_Get_count(&stat,MPI_INT,&size);
printf("Receiving size=%d at i=%d\n",size,i);
MPI_Irecv (&data[0],size,MPI_INT,0,1,MPI_COMM_WORLD,&req2);
size=0;
}
printf("Finished rank=%d\n",rank);
}
MPI_Barrier(MPI_COMM_WORLD);
MPI_Finalize();
}