MPI matrix multiplication, processes not cleaning up

Date: 2015-09-23 04:30:04

Tags: c matrix parallel-processing mpi

I am trying to multiply two n×n matrices using MPI. The second matrix (bb) is broadcast to all the "slaves", and then a row of the first matrix (aa) is sent to each of them to compute its part of the product. Each slave sends its result back to the master process, where it is stored in the product matrix cc1. For some reason I am getting the error:

=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES

I believe the master process is receiving every message the slaves send, and vice versa, so I'm not sure what is going wrong here... Any ideas?

Main:

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/times.h>
#define min(x, y) ((x)<(y)?(x):(y))
#define MASTER 0

double* gen_matrix(int n, int m);
int mmult(double *c, double *a, int aRows, int aCols, double *b, int bRows, int bCols);

int main(int argc, char* argv[]) {
    int nrows, ncols;
    double *aa;     /* the A matrix */
    double *bb;     /* the B matrix */
    double *cc1;    /* A x B computed */
    double *buffer; /* Row to send to slave for processing */
    double *ans;    /* Computed answer for master */
    int myid, numprocs;
    int i, j, numsent, sender;
    int row, anstype;
    double starttime, endtime;
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    if (argc > 1) {
        nrows = atoi(argv[1]);
        ncols = nrows;
        if (myid == 0) {
            /* Master Code */
            aa = gen_matrix(nrows, ncols);
            bb = gen_matrix(ncols, nrows);
            cc1 = malloc(sizeof(double) * nrows * nrows);
            starttime = MPI_Wtime();
            buffer = (double*)malloc(sizeof(double) * ncols);
            numsent = 0;
            MPI_Bcast(bb, ncols*nrows, MPI_DOUBLE, MASTER, MPI_COMM_WORLD); /*broadcast bb to all slaves*/
            for (i = 0; i < min(numprocs-1, nrows); i++) {                  /*for each process or row*/
                for (j = 0; j < ncols; j++) {                               /*for each column*/
                    buffer[j] = aa[i * ncols + j];                          /*get row of aa*/
                }
                MPI_Send(buffer, ncols, MPI_DOUBLE, i+1, i+1, MPI_COMM_WORLD); /*send row to slave*/
                numsent++;                                                     /*increment number of rows sent*/
            }
            ans = (double*)malloc(sizeof(double) * ncols);
            for (i = 0; i < nrows; i++) {
                MPI_Recv(ans, ncols, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
                         MPI_COMM_WORLD, &status);
                sender = status.MPI_SOURCE;
                anstype = status.MPI_TAG;

                for (i = 0; i < ncols; i++){
                    cc1[(anstype-1) * ncols + i] = ans[i];
                }

                if (numsent < nrows) {
                    for (j = 0; j < ncols; j++) {
                        buffer[j] = aa[numsent*ncols + j];
                    }
                    MPI_Send(buffer, ncols, MPI_DOUBLE, sender, numsent+1,
                             MPI_COMM_WORLD);
                    numsent++;
                } else {
                    MPI_Send(MPI_BOTTOM, 0, MPI_DOUBLE, sender, 0, MPI_COMM_WORLD);
                }
            }

            endtime = MPI_Wtime();
            printf("%f\n",(endtime - starttime));
        } else {
            /* Slave Code */
            buffer = (double*)malloc(sizeof(double) * ncols);
            bb = (double*)malloc(sizeof(double) * ncols*nrows);
            MPI_Bcast(bb, ncols*nrows, MPI_DOUBLE, MASTER, MPI_COMM_WORLD); /*get bb*/
            if (myid <= nrows) {
                while(1) {
                    MPI_Recv(buffer, ncols, MPI_DOUBLE, MASTER, MPI_ANY_TAG, MPI_COMM_WORLD, &status); /*receive a row of aa*/
                    if (status.MPI_TAG == 0){
                        break;
                    }

                    row = status.MPI_TAG; /*get row number*/
                    ans = (double*)malloc(sizeof(double) * ncols);
                    for (i = 0; i < ncols; i++){
                        ans[i]=0.0;
                    }
                    for (i=0; i<nrows; i++){
                        for (j = 0; j < ncols; j++) { /*for each column*/
                            ans[i] += buffer[j] * bb[j * ncols + i];
                        }
                    }
                    MPI_Send(ans, ncols, MPI_DOUBLE, MASTER, row, MPI_COMM_WORLD);
                }
            }
        } /*end slave code*/
    } else {
        fprintf(stderr, "Usage matrix_times_vector <size>\n");
    }
    MPI_Finalize();
    return 0;
}

1 Answer:

Answer 0 (score: 2)

This error message usually means that at least one MPI process crashed, after which the whole MPI job was aborted. It can be caused by any kind of error, but most of the time it is a segmentation fault triggered by an invalid memory access.

I haven't looked at the code closely, so I can't tell whether the logic is valid, but what I can say is that this line is problematic:

MPI_Recv(&ans, nrows, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
         MPI_COMM_WORLD, &status);

Indeed, there are two problems here:

  1. &ans is a double **, which is not what you want; I guess you meant ans
  2. ans has not been allocated, so it cannot be used as a receive buffer

Try fixing these first and see what happens (see the sketch below).
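
Purely as an illustration, assuming ans is meant to hold one row of ncols doubles and keeping the variable names from your code, the corrected receive could look like:

ans = (double*)malloc(sizeof(double) * ncols); /* allocate the receive buffer before using it */
MPI_Recv(ans, ncols, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG, /* pass ans itself, not &ans */
         MPI_COMM_WORLD, &status);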

    EDIT: In your new code you allocate ans like this:

    ans = (double*)malloc(sizeof(double) * ncols);
    

    Then you initialize it like this:

    for (i = 0; i < nrows; i++) {
        ans[i]=0.0;
    }
    

    And you use it like this:

    MPI_Send(ans, nrows, MPI_DOUBLE, MASTER, row, MPI_COMM_WORLD);
    

    MPI_Recv(ans, nrows, MPI_DOUBLE, MPI_ANY_SOURCE, MPI_ANY_TAG,
             MPI_COMM_WORLD, &status);
    

    This is not consistent: is the size of ans ncols or nrows?
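
    Just as an illustration (a sketch, not a drop-in fix): since nrows and ncols are equal for your square matrices, either count works, but whichever one you pick should be used for the allocation, the initialization loop, the MPI_Send count, and the matching MPI_Recv count. For example, using ncols throughout on the slave side:

    ans = (double*)malloc(sizeof(double) * ncols); /* one row of the product */
    for (i = 0; i < ncols; i++) {                  /* initialize all ncols entries */
        ans[i] = 0.0;
    }
    /* ... compute the row ... */
    MPI_Send(ans, ncols, MPI_DOUBLE, MASTER, row, MPI_COMM_WORLD);

    The master's matching MPI_Recv would then also use ncols as its count.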

    What is your new error message?