Question

我有一个MPI程序正在解决“公制旅行商问题”。当我在Windows上运行它时，它按预期工作，并打印最短的路径。当我在linux上运行它时，我得到一条消息说mpirun注意到信号11退出了进程等级0。当我在StackOverflow上搜索这个问题时，我发现它经常在向MPI的发送/接收函数发送错误的参数时发生，但我查看了我的代码，并且参数看起来很好。我怎么能检查我的错误？

如果有帮助，这是两个代码文件： main.c：

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// forward declaration of tsp_main
int tsp_main(int citiesNum, int xCoord[], int yCoord[], int shortestPath[]);

int main(int argc, char** argv)  
{
    //int citiesNum = 18; //set a lower number for testing
    int citiesNum = 10;
    int xCoord[] = {1, 12, 13, 5, 5, 10, 5, 6, 7, 8, 9, 4, 11, 14, 4,8,4,6};
    int yCoord[] = {7, 2, 3, 3, 5, 6, 7, 8, 9, 4, 11, 12, 13, 14, 5,1,7,33};

int* shortestPath = (int*)malloc(citiesNum * sizeof(int));  
int i, myRank, minPathLen;  
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myRank);

clock_t begin = clock();
minPathLen = tsp_main(citiesNum, xCoord, yCoord, shortestPath);
clock_t end = clock();

if (myRank == 0)
{
    printf("Execution time: %g seconds\n", (double)(end - begin) / CLOCKS_PER_SEC);
    printf("The shortest path, %d long, is:\n", minPathLen);  
    for (i = 0; i < citiesNum; i++)
    {
        // print the city (and its distance from the next city in the path)
        printf("%d (%d) ", shortestPath[i], 
                abs(xCoord[shortestPath[i]] - xCoord[shortestPath[(i + 1) % citiesNum]]) + 
                abs(yCoord[shortestPath[i]] - yCoord[shortestPath[(i + 1) % citiesNum]]) );
    }
    printf("%d\n", shortestPath[0]);
}

MPI_Finalize ();
return 0;  
}

和tsp.c：

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <assert.h>
#include <string.h>
#include <time.h>

//play with this value to change the workload.
#define WORK_LOAD (10)
#define getDistance(a,b) (DistanceArray[a][b])

enum defines {
    MASTER_ID = 0,          // master ID. set to 0 (better safe than sorry)
    DISTRIBUTE_NEW_TASK,    // when the master send a new task to a worker
    ASK_FOR_TASK,           // when a worker asks from the master a new task
    KILL,                   // when the master notifies the worker to die
    SEND_MINIMUM,           // when a process sends its current minimum
    SEND_PATH,              // when a worker updates the master of his best path
};

// initializes the factorials array. the i cell will contain i!. for example, factorials[4] will contain 24.
void initializeFactorials(long long int* factorials, int citiesNum) {
    int i;
    factorials[0] = 1;
    for (i = 1; i < citiesNum; ++i) {
        factorials[i] = i * factorials[i-1];
    }
}
// initializes the two dimensional distance array. Element k,l will contain the distance between city k and city l
void initializeDistanceArray(int** DistanceArray, int citiesNum, int xCoord[], int yCoord[]) {
    int k,l;
    for (k=0; k < citiesNum; ++k) {
        DistanceArray[k] = (int*)malloc(sizeof(int)*citiesNum);
    }
    for (k=0; k < citiesNum; ++k) {
        for (l=0; l < citiesNum; ++l) {
            DistanceArray[k][l] = abs(xCoord[k] - xCoord[l]) + abs(yCoord[k] - yCoord[l]);
        }
    }
}
/* initializes the edge minimum array. Element i contains the minimum weight of i+1 edges between different cities.
   For example, element 0 contains the minimal edge. Element 5 contains the total weight of 6 edges going out from different cities*/
void initializeEdgeMinimum(int* edgeMinimum, int** DistanceArray, int citiesNum) {
    int k, l, sum=0;
    for (k=0; k < citiesNum; ++k) {
        edgeMinimum[k] = INT_MAX;
        for (l=0; l < citiesNum; ++l) {
            if (l == k) continue;
            if (getDistance(k,l) < edgeMinimum[k]) edgeMinimum[k] = getDistance(k,l);
        }
    }
    for (k=0; k < citiesNum-1; ++k) {
        for (l=k+1; l < citiesNum; ++l) {
            if (edgeMinimum[l]>edgeMinimum[k]){
                int temp = edgeMinimum[k];
                edgeMinimum[k] = edgeMinimum[l];
                edgeMinimum[l] = temp;
            }
        }
    }
    for (k=citiesNum-1; k >= 0; --k) {
        sum += edgeMinimum[k];
        edgeMinimum[k] = sum;
    }
}
/* takes an index of a path as an argument, and converts it according to the decision tree (as explained in the external documentation)
   to a path (circle) between cities*/
void convertIndexToPath(long long int index, int citiesNum, int* decisionTree, int* options, int* path, long long int* factorials) {
    int i, j, decision;
    long long int fact;
    for(i = 0; i < citiesNum; ++i) {
        fact = factorials[citiesNum-(i+1)];   
        decision = (int)(index/fact);
        decisionTree[i] = decision;
        index -= fact*((long long int)decision); 
    }
    for(i = 0; i < citiesNum; ++i) {
        options[i] = i+1;
    }
    path[0] = 0;
    for(i = 1; i < citiesNum; ++i) {
        path[i] = options[decisionTree[i]];
        for(j = decisionTree[i]; j < citiesNum-i-1; ++j) {
            options[j] = options[j+1];      
        }
    }
}
/* takes the decision tree of the last path (as explained in the external documentation) and converts it to a path
   ASSUMPTION: can be used ONLY if the last path was NOT pruned */
void convertdDecisionToPath(long long int index, int citiesNum, int* decisionTree, int* options, int* path, long long int* factorials) {
    int i, j;
    for(i = citiesNum-2; i > 0; --i) {
        decisionTree[i] = (decisionTree[i] + 1) % (citiesNum - i);
        if (decisionTree[i] != 0) break;
    }
    for(i = 0; i < citiesNum; ++i) {
        options[i] = i+1;
    }
    path[0] = 0;
    for(i = 1; i < citiesNum; ++i) {
        path[i] = options[decisionTree[i]];
        for(j = decisionTree[i]; j < citiesNum-i-1; ++j) {
            options[j] = options[j+1];      
        }
    }
}
// returns one index before the next iteration of the index of the path that we need to explore right after pruning.
long long int getIndexAfterPrune(long long int index, int citiesVisitedInPath, int citiesNum, int* decisionTree, long long int* factorials) {
    int decision, i;
    long long int fact, nextIndex = 0, indexBackup = index;
    for(i = 0; i < citiesNum; ++i) {
        fact = factorials[citiesNum-(i+1)];   
        decision = (int)(index/fact);
        decisionTree[i] = decision;
        index -= fact*((long long int)decision); 
    }
    for(i = citiesVisitedInPath + 1; i < citiesNum; ++i) {
        decisionTree[i] = 0;
    }
    for(i = 0; i < citiesNum; ++i) {
        nextIndex += (decisionTree[i] * factorials[citiesNum-1-i]);
    }
    nextIndex += factorials[citiesNum - citiesVisitedInPath];
    return nextIndex-1;
}
// returns how many possibilities (paths) there are in a single chunk
long long int getChunkSize(int citiesNum, int workersNum, long long int* factorials) {
    --citiesNum;
    long long int allPossibilities = factorials[citiesNum];
    // empirically setting the chunk size
    long long int chunkSize = WORK_LOAD*(allPossibilities/factorials[citiesNum/2]) / (workersNum);
    if (citiesNum <= 3 || chunkSize == 0) { //the job is small, and one worker can handle it
        return allPossibilities;
    }
    return chunkSize;
}
// returns the number of chunks that we need to handle
long long int getNumberOfChunks(int citiesNum, long long int chunkSize, long long int* factorials) {
    --citiesNum;
    long long int allPossibilities = factorials[citiesNum];
    int lastChunk = 0;
    if (allPossibilities % chunkSize != 0) {
        lastChunk = 1;
    }
    return (allPossibilities/chunkSize) + lastChunk;
}
// returns how many workers should work on the task.
int getNeededWorkers(long long int numberOfChunks, int workersNum) {
    if (workersNum >= numberOfChunks) {
        return (int)numberOfChunks;
    }
    return workersNum;
}
/*  
    Splits the problem into many sub problems and sends tasks to the workers.
    each task contains the start index (the stop index is simply calculated from the chunk size) and the optimal price known so far.
    The master also listens for updates of the optimal price.
    when all the workers finish, the masters send them a request to update him with their optimal solution, and then decides what's the global optimum.
    conventions: variables are in camelCase, consts are in ALL_CAPS, and two dimensional arrays are in PascalCasing
*/
int runMaster(int citiesNum, int xCoord[], int yCoord[], int shortestPath[], int processesNum) {
    // Variables
    int doneWorkers = 0, neededWorkers, gotAnswer, junk, currentMinimum = INT_MAX;
    long long int chunkSize, indexToCheck = 0, bestPathIndex, numberOfChunks;
    MPI_Status status1, status2, status3;
    MPI_Request request1 = MPI_REQUEST_NULL, junkRequest = MPI_REQUEST_NULL;
    // Arrays
    int *decisionTree, *options;
    long long int *factorials, *recieveBuffer, *sendBuffer;
// Dynamic Allocations
decisionTree    = (int*)malloc(citiesNum * sizeof(int));
options         = (int*)malloc(citiesNum * sizeof(int));
factorials      = (long long int*)malloc(citiesNum * sizeof(long long int));
recieveBuffer   = (long long int*)malloc(2 * sizeof(long long int));
sendBuffer      = (long long int*)malloc(2 * sizeof(long long int));

initializeFactorials(factorials, citiesNum);

long long int lastIndex = factorials[citiesNum-1]-1;
chunkSize = getChunkSize(citiesNum, processesNum-1, factorials);
numberOfChunks = getNumberOfChunks(citiesNum, chunkSize, factorials);
neededWorkers = getNeededWorkers(numberOfChunks, processesNum-1);

while (doneWorkers < neededWorkers) {
    //check if a worker wants a new task
    MPI_Probe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status1);
    gotAnswer = 1;
    MPI_Iprobe(MPI_ANY_SOURCE, ASK_FOR_TASK, MPI_COMM_WORLD, &gotAnswer, &status1);
    if (gotAnswer) {
        MPI_Recv(&junk, 0, MPI_INT, MPI_ANY_SOURCE, ASK_FOR_TASK, MPI_COMM_WORLD, &status1); //blocking recieve since we need the request to complete so we'd know who should get the task
        if (indexToCheck <= lastIndex) {
            // the master sends the current minimum, and a new job to the worker
            sendBuffer[0] = currentMinimum;
            sendBuffer[1] = indexToCheck;
            indexToCheck += chunkSize;
            MPI_Irsend(sendBuffer, 2, MPI_LONG_LONG_INT, status1.MPI_SOURCE, DISTRIBUTE_NEW_TASK, MPI_COMM_WORLD, &request1); // we're guaranteed that the worker called IRecv and ready to get a task. no need to block since we can continue doing are own calculations.
            MPI_Request_free(&request1);
        } else { // the master kills the worker
            MPI_Irsend(&junk, 0, MPI_INT, status1.MPI_SOURCE, KILL, MPI_COMM_WORLD, &junkRequest); // we're guaranteed that the worker called IRecv and ready to get a task. no need to block since we can continue doing are own calculations.
            MPI_Request_free(&junkRequest);
        }
        continue;
    }

    gotAnswer = 1;
    MPI_Iprobe(MPI_ANY_SOURCE, SEND_MINIMUM, MPI_COMM_WORLD, &gotAnswer, &status2);
    if(gotAnswer) { // the master recieves a miminal price from one of the workers and decides if it's the global minimum
        MPI_Recv(recieveBuffer, 1, MPI_LONG_LONG_INT, MPI_ANY_SOURCE, SEND_MINIMUM, MPI_COMM_WORLD, &status2); // blocking, since we're going to use the recieve buffer
        currentMinimum = (currentMinimum <= recieveBuffer[0]) ? currentMinimum : (int)recieveBuffer[0];
        continue;
    }

    gotAnswer = 1;
    MPI_Iprobe(MPI_ANY_SOURCE, SEND_PATH, MPI_COMM_WORLD, &gotAnswer, &status3);
    if(gotAnswer) {
        // the master recieves a miminal path and price from one of the workers and decides if it's the global minimum
        MPI_Recv(recieveBuffer, 2, MPI_LONG_LONG_INT, MPI_ANY_SOURCE, SEND_PATH, MPI_COMM_WORLD, &status3); // blocking, since we're going to use the recieve buffer
        ++doneWorkers;
        if(recieveBuffer[0] <= currentMinimum){
            currentMinimum = (int)recieveBuffer[0];
            bestPathIndex = recieveBuffer[1];
        }
    }
} //while

free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(sendBuffer);
convertIndexToPath(bestPathIndex, citiesNum, decisionTree, options, shortestPath, factorials);
return currentMinimum;
}
/*  
    gets tasks from the master and process them until there are no more tasks to handle.
    in each task, we go through all the possibilities in the current chunk, but skip paths that are heavier from the current known optimal path.
    when we discover a new optimal path, we update the rest of the threads if necesssary (first, we check if we got a new optimal weight from them).
    conventions: variables are in camelCase, consts are in ALL_CAPS, and two dimensional arrays are in PascalCasing
*/
void runWorker(int citiesNum, int xCoord[], int yCoord[], int shortestPath[], int processesNum) {
    //Variables
    int sum = 0, gotAnswer = 1, PRUNE_FACTOR = citiesNum - 3, LAST_CITY = citiesNum-1, indexReachedInPath, pruned, sumUntilPruned = 0, IndexUntilPruned = 0, doneWorkers = 0, neededWorkers, junk, myCurrentMinimum = INT_MAX, othersCurrentMinimum = INT_MAX, pid, k;
    long long int numberOfChunks, chunkSize, indexToCheck = 0, startIndex, stopIndex, i, bestPathIndex = -1, lastIndex;
    MPI_Status status1, status2;
    MPI_Request request1 = MPI_REQUEST_NULL, junkRequest = MPI_REQUEST_NULL;
    //Arrays
    int *decisionTree, *edgeMinimum, *myCurrentPath, *options, **DistanceArray;
    long long int *factorials, *recieveBuffer, *sendBuffer;

// Dynamic Allocations
decisionTree    = (int*)malloc(citiesNum * sizeof(int));
edgeMinimum     = (int*)malloc(sizeof(int)*citiesNum);
myCurrentPath   = (int*)malloc(citiesNum * sizeof(int));
options         = (int*)malloc(citiesNum * sizeof(int));
factorials      = (long long int*)malloc(citiesNum * sizeof(long long int));
recieveBuffer   = (long long int*)malloc(2 * sizeof(long long int));
sendBuffer      = (long long int*)malloc(2 * sizeof(long long int));
DistanceArray   = (int**)malloc(sizeof(int*)*citiesNum);

initializeFactorials(factorials, citiesNum);
initializeDistanceArray(DistanceArray, citiesNum, xCoord, yCoord);
initializeEdgeMinimum(edgeMinimum, DistanceArray, citiesNum);

MPI_Comm_rank(MPI_COMM_WORLD, &pid);
lastIndex = factorials[citiesNum-1]-1;
chunkSize = getChunkSize(citiesNum, processesNum-1, factorials);
numberOfChunks = getNumberOfChunks(citiesNum, chunkSize, factorials);
neededWorkers = getNeededWorkers(numberOfChunks, processesNum-1);
if (pid > neededWorkers){ //free memory and exit
    for (k=0; k < citiesNum; ++k) {
        free(DistanceArray[k]);
    }
    free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(sendBuffer); free(edgeMinimum); free(DistanceArray); free(myCurrentPath);
    return;
}

MPI_Irecv(recieveBuffer, 2, MPI_LONG_LONG_INT, MASTER_ID, MPI_ANY_TAG, MPI_COMM_WORLD, &request1); // getting ready to recieve a new task from the master. no need to block
MPI_Ssend(&junk, 0, MPI_INT, MASTER_ID, ASK_FOR_TASK, MPI_COMM_WORLD); // asking the master for a new task. synced & blocking, since we don't have anything else to do until we get a new task

while(1) {
    MPI_Wait(&request1, &status1); //avoid busy-wait

    if (status1.MPI_TAG == DISTRIBUTE_NEW_TASK) {
        // the worker got a new job from the master. recieveBuffer[0] contains the server's currentMinimum. recieveBuffer[1] contains indexToCheck
        othersCurrentMinimum = (recieveBuffer[0] < othersCurrentMinimum) ? (int)recieveBuffer[0] : othersCurrentMinimum; 
        startIndex = recieveBuffer[1]; 
        stopIndex = (startIndex + chunkSize >= lastIndex) ? lastIndex + 1 : startIndex + chunkSize;

        pruned = 1;
        indexReachedInPath = 0;
        sum = 0;

        for(i = startIndex; i < stopIndex; ++i) {
            if (pruned) {   // calculate the current path from the index
                convertIndexToPath(i, citiesNum, decisionTree, options, myCurrentPath, factorials);
            } else {        // calculate the current path from the last path (decision tree)
                convertdDecisionToPath(i, citiesNum, decisionTree, options, myCurrentPath, factorials);
            }
            sum = 0;
            indexReachedInPath = 0;             
            pruned = 0;
            for(; indexReachedInPath < LAST_CITY; ++indexReachedInPath) {
                sum += getDistance(myCurrentPath[indexReachedInPath], myCurrentPath[indexReachedInPath+1]);
                if (indexReachedInPath < PRUNE_FACTOR && sum + edgeMinimum[indexReachedInPath] >= othersCurrentMinimum) {
                    //prune
                    pruned = 1;
                    sum -= getDistance(myCurrentPath[indexReachedInPath], myCurrentPath[indexReachedInPath+1]);
                    if (indexReachedInPath == 0) {
                        i = getIndexAfterPrune(i,1,citiesNum, decisionTree, factorials); 
                    } else {
                        i = getIndexAfterPrune(i,indexReachedInPath,citiesNum, decisionTree, factorials); 
                    }
                    break;
                }
            }

            if(pruned) continue;
            sum += getDistance(myCurrentPath[LAST_CITY], myCurrentPath[0]); //return from the last city to the first

            if(sum < othersCurrentMinimum) {
                myCurrentMinimum = sum;
                bestPathIndex = i;

                //check for a new global minimum
                gotAnswer = 1;
                MPI_Iprobe(MPI_ANY_SOURCE, SEND_MINIMUM, MPI_COMM_WORLD, &gotAnswer, &status2);
                if(gotAnswer) {
                    MPI_Recv(recieveBuffer, 1, MPI_INT, MPI_ANY_TAG, SEND_MINIMUM, MPI_COMM_WORLD, &status2); // blocking, since we're going to use the recieve buffer
                    othersCurrentMinimum = (recieveBuffer[0] < othersCurrentMinimum) ? (int)recieveBuffer[0] : othersCurrentMinimum; 
                }

                if (myCurrentMinimum < othersCurrentMinimum) {
                    othersCurrentMinimum = sum;
                    for (k = 0; k < processesNum; ++k) {
                        if (junk == pid) continue;
                        MPI_Issend(&myCurrentMinimum, 1, MPI_INT, k, SEND_MINIMUM, MPI_COMM_WORLD, &junkRequest); // sending everyone our minimum, copying it to their memory. obviously, no need to block.
                    }
                }
            }
        }

        //send my minimum to the master if it's the global minimum
        //if (myCurrentMinimum <= othersCurrentMinimum) {
        //  MPI_Issend(&myCurrentMinimum, 1, MPI_INT, MASTER_ID, SEND_MINIMUM, MPI_COMM_WORLD, &junkRequest); // sending the master our minimum.
        //}
        // get a new task from the master
        MPI_Irecv(recieveBuffer, 2, MPI_LONG_LONG_INT, MASTER_ID, MPI_ANY_TAG, MPI_COMM_WORLD, &request1); // getting ready to recieve a new task from the master. no need to block
        MPI_Ssend(&junk, 0, MPI_INT, MASTER_ID, ASK_FOR_TASK, MPI_COMM_WORLD); // asking the master for a new task. blocking, since we don't have anything else to do until we get a new task
        continue;
    }

    if(status1.MPI_TAG == KILL) { // free resources, send the master the optimal path and price, and die.
        for (k=0; k < citiesNum; ++k) {
            free(DistanceArray[k]);
        }
        free(factorials); free(decisionTree); free(options); free(recieveBuffer);  free(edgeMinimum); free(DistanceArray); free(myCurrentPath);

        sendBuffer[0] = myCurrentMinimum;
        sendBuffer[1] = bestPathIndex;
        MPI_Ssend(sendBuffer, 2, MPI_LONG_LONG_INT, MASTER_ID, SEND_PATH, MPI_COMM_WORLD); // synced & blocking, since we don't have anything else to do until the master gets the information
        free(sendBuffer);
        return;
    }
} // while
}

// The static parellel algorithm main function. runs the master and the workers.
int tsp_main(int citiesNum, int xCoord[], int yCoord[], int shortestPath[])
{
    int rank, processesNum, result = 0;
    MPI_Comm_size(MPI_COMM_WORLD, &processesNum);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

if (rank == 0) {
    result = runMaster(citiesNum, xCoord, yCoord, shortestPath, processesNum);
} else {
    runWorker(citiesNum, xCoord, yCoord, shortestPath, processesNum);
}
MPI_Barrier(MPI_COMM_WORLD);
return result;
}

Answer 1

我尝试了你的代码，我设法摆脱了错误。我收到一个信号：“浮点异常：整数除以零”。我搜索了发生异常的地方，发现它来自第一个/fact。它是从proc 0抛出的，所以我去了runMaster()。 free(fact)之后有一行。我置换了这些线条，错误消失了。

这种方式可能是正确的：

 convertIndexToPath(bestPathIndex, citiesNum, decisionTree, options, shortestPath, factorials);
 free(factorials); free(decisionTree); free(options); free(recieveBuffer); free(sendBuffer);

然而，我尝试使用2或3个processus的程序，输出结果不同......我很惊讶它之前有效！

再见，弗朗西斯

mpi c mpirun注意到信号11上的进程等级0退出

1 个答案: