MPI程序中的分段错误

时间:2017-11-05 18:59:52

标签: c parallel-processing mpi

以下是一个程序,旨在对两个大小为128 * 128的矩阵进行求和,将任务拆分为8进程,因此每个进程对16行矩阵求和。

int main(int argc, char** argv)
{
    int rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int **matrixA;
    int **matrixB;
    int **resultMatrix = malloc(ROWS * sizeof(int *));

    if (rank == 0) {
       matrixA = generateMatrix();
       matrixB = generateMatrix();
       for (i = 0; i < COLS; i++)
          resultMatrix[i] = malloc(COLS * sizeof(int));
    }

    int **auxA = malloc(16 * sizeof(int *));
    int **auxB = malloc(16 * sizeof(int *));
    int **auxC = malloc(16 * sizeof(int *));

    int i;
    int row, col;

    for (i = 0; i < 16; i++)
    {
       auxA[i] = malloc(COLS * sizeof(int));
       auxB[i] = malloc(COLS * sizeof(int));
       auxC[i] = malloc(COLS * sizeof(int));
    }

    MPI_Scatter(&(matrixA[0][0]), 16*COLS, MPI_INT, &(auxA[0][0]), 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(&(matrixB[0][0]), 16*COLS, MPI_INT, &(auxB[0][0]), 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);

    char hostname[HOST_NAME_MAX];
    if (! gethostname(hostname, sizeof hostname) == 0)
        perror("gethostname");

    for (row = 0; row < 16; row++)
    {
       for (col = 0; col < COLS; col++)
       {
           auxC[row][col] = auxA[row][col] + auxB[row][col];
       }
    }

    printf("Process Node %s %d\n done",hostname, rank);

    MPI_Gather(&auxC[0][0], 16*COLS, MPI_INT, &(resultMatrix[0][0]), 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
       printMatrix(resultMatrix);
    }

    MPI_Finalize();
}   

获取以下错误消息

[xxx115:12933] *** Process received signal ***
[xxx115:12933] Signal: Segmentation fault (11)
[xxx115:12933] Signal code: Address not mapped (1)
[xxx115:12933] Failing at address: 0x2
[xxx115:12934] *** Process received signal ***
[xxx115:12934] Signal: Segmentation fault (11)
[xxx115:12934] Signal code: Address not mapped (1)
[xxx115:12934] Failing at address: 0x2
[xxx115:12936] *** Process received signal ***
[xxx115:12936] Signal: Segmentation fault (11)
[xxx115:12936] Signal code: Address not mapped (1)
[xxx115:12936] Failing at address: 0x2
[xxx115:12933] [ 0] /lib64/libpthread.so.0[0x32ff00f7e0]
[xxx115:12933] [ 1] ./hw5[0x400bd4]
[xxx115:12933] [ 2] /lib64/libc.so.6(__libc_start_main+0xfd)[0x32fe41ed1d]
[xxx115:12933] [ 3] ./hw5[0x400879]
[xxx115:12936] [ 0] /lib64/libpthread.so.0[0x32ff00f7e0]
[xxx115:12936] [ 1] ./hw5[0x400bd4]
[xxx115:12936] [ 2] [wsu115:12933] *** End of error message ***

正如我从thisthis等其他问题中观察到的那样,当数组尝试从其边界访问内存时,会出现分段错误。

但是,我不是程序的哪个部分导致此错误。我该怎么做才能解决问题。

修改

在完成评论之后,我意识到之前的程序杂乱无章。这是一个简单的版本,没有任何错误,但仍未提供预期的输出。

int **generateMatrix() 
{
    int **matrix = (int **)malloc(ROWS * sizeof(int *));
    int i;
    for (i = 0; i < ROWS; i++)
         matrix[i] = (int *)malloc(COLS * sizeof(int));

    int row, col;
    for (row = 0; row < ROWS; row++) 
    {
        for (col = 0; col < COLS; col++) 
        {
            matrix[row][col] = col; 
        }
    }
    return matrix;
}

int main(int argc, char** argv)
{
    int rank, world_size;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    int **matrixA;
    int **matrixB;
    int resultMatrix[128][128];

    int auxA[16][128];
    int auxB[16][128];
    int auxC[16][128];

    if (rank == 0) {
        matrixA = generateMatrix();
        matrixB = generateMatrix();
    }

    MPI_Scatter(matrixA, 16*COLS, MPI_INT, auxA, 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);
    MPI_Scatter(matrixB, 16*COLS, MPI_INT, auxB, 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);

    char hostname[HOST_NAME_MAX];
    if (! gethostname(hostname, sizeof hostname) == 0)
      perror("gethostname");

    int row, col; 
    for (row = 0; row < 16; row++)
    {
        for (col = 0; col < COLS; col++)
        {
            auxC[row][col] = auxA[row][col] + auxB[row][col];
        }
    }

    printf("Process Node %s %d done\n",hostname, rank);

    MPI_Gather(auxC, 16*COLS, MPI_INT, resultMatrix, 16*COLS, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printMatrix(resultMatrix);
    }

    MPI_Finalize();
    return 0;
}   

0 个答案:

没有答案