Question

     /**
     * BLOCK_LOW
     * Returns the offset of a local array
     * with regards to block decomposition
     * of a global array.
     *
     * @param  (int) process rank
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) offset of local array in global array
     */
    #define BLOCK_LOW(id, p, n) ((id)*(n)/(p))

    /**
     * BLOCK_HIGH
     * Returns the index immediately after the
     * end of a local array with regards to
     * block decomposition of a global array.
     *
     * @param  (int) process rank
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) offset after end of local array
     */
    #define BLOCK_HIGH(id, p, n) (BLOCK_LOW((id)+1, (p), (n)))

    /**
     * BLOCK_SIZE
     * Returns the size of a local array
     * with regards to block decomposition
     * of a global array.
     *
     * @param  (int) process rank
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) size of local array
     */
    #define BLOCK_SIZE(id, p, n) ((BLOCK_HIGH((id), (p), (n))) - (BLOCK_LOW((id), (p), (n))))

    /**
     * BLOCK_OWNER
     * Returns the rank of the process that
     * handles a certain local array with
     * regards to block decomposition of a
     * global array.
     *
     * @param  (int) index in global array
     * @param  (int) total number of processes
     * @param  (int) size of global array
     * @return (int) rank of process that handles index
     */
    #define BLOCK_OWNER(i, p, n) (((p)*((i)+1)-1)/(n))



    /*Matricefilenames:
      small matrix A.bin of dimension 100 × 50
      small matrix B.bin of dimension 50 × 100
      large matrix A.bin of dimension 1000 × 500
      large matrix B.bin of dimension 500 × 1000

    An MPI program should be implemented such that it can
    • accept two file names at run-time,
    • let process 0 read the A and B matrices from the two data files,
    • let process 0 distribute the pieces of A and B to all the other processes,
    • involve all the processes to carry out the the chosen parallel algorithm
    for matrix multiplication C = A * B ,
    • let process 0 gather, from all the other processes, the different pieces
    of C ,
    • let process 0 write out the entire C matrix to a data file.
    */


    #include <stdio.h>
    #include <stdlib.h>
    #include <mpi.h>
    #include "mpi-utils.c"
    void read_matrix_binaryformat (char*, double***, int*, int*);
    void write_matrix_binaryformat (char*, double**, int, int);
    void create_matrix (double***,int,int);
    void matrix_multiplication (double ***, double ***, double ***,int,int, int);

    int main(int argc, char *argv[]) {
        int id,p; // Process rank and total amount of processes
        int rowsA, colsA, rowsB, colsB; // Matrix dimensions
        double **A; // Matrix A
        double **B; // Matrix B
        double **C; // Result matrix C : AB
        int local_rows; // Local row dimension of the matrix A
        double **local_A; // The local A matrix
        double **local_C;  // The local C matrix

        MPI_Init (&argc, &argv);
        MPI_Comm_rank (MPI_COMM_WORLD, &id);
        MPI_Comm_size (MPI_COMM_WORLD, &p);

        if(argc != 3) {
            if(id == 0) {
                printf("Usage:\n>> %s matrix_A matrix_B\n",argv[0]);
            }       
            MPI_Finalize();
            exit(1);
        }

        if (id == 0) {
            read_matrix_binaryformat (argv[1], &A, &rowsA, &colsA);
            read_matrix_binaryformat (argv[2], &B, &rowsB, &colsB);
        }

        if (p == 1) {
            create_matrix(&C,rowsA,colsB);
            matrix_multiplication (&A,&B,&C,rowsA,colsB,colsA);

            char* filename = "matrix_C.bin";
            write_matrix_binaryformat (filename, C, rowsA, colsB);
            free(A);
            free(B);
            free(C);
            MPI_Finalize();
            return 0;
        }


        // For this assignment we have chosen to bcast the whole matrix B:
        MPI_Bcast (&B, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); 
        MPI_Bcast (&colsA, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast (&colsB, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast (&rowsA, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast (&rowsB, 1, MPI_INT, 0, MPI_COMM_WORLD);

        local_rows = BLOCK_SIZE(id, p, rowsA);


        /*    SCATTER VALUES    */

        int *proc_elements = (int*)malloc(p*sizeof(int)); // amount of elements for each processor
        int *displace = (int*)malloc(p*sizeof(int)); // displacement of elements for each processor
        int i;
        for (i = 0; i<p; i++) {
            proc_elements[i] = BLOCK_SIZE(i, p, rowsA)*colsA;
            displace[i] = BLOCK_LOW(i, p, rowsA)*colsA;
        }

        create_matrix(&local_A,local_rows,colsA);

        MPI_Scatterv(&A[0],&proc_elements[0],&displace[0],MPI_DOUBLE,&local_A[0],
                     local_rows*colsA,MPI_DOUBLE,0,MPI_COMM_WORLD);

        /*    END  SCATTER  VALUES  */  

        create_matrix (&local_C,local_rows,colsB);
        matrix_multiplication (&local_A,&B,&local_C,local_rows,colsB,colsA);

        /*    GATHER VALUES    */

        MPI_Gatherv(&local_C[0], rowsA*colsB, MPI_DOUBLE,&C[0],
              &proc_elements[0],&displace[0],MPI_DOUBLE,0, MPI_COMM_WORLD);

        /*    END  GATHER VALUES  */

        char* filename = "matrix_C.bin";
        write_matrix_binaryformat (filename, C, rowsA, colsB);  

        free (proc_elements);
        free (displace);    
        free (local_A);
        free (local_C);
        free (A);
        free (B);
        free (C);   
        MPI_Finalize ();
        return 0;
    }

    void create_matrix (double ***C,int rows,int cols) {
        *C = (double**)malloc(rows*sizeof(double*));
        (*C)[0] = (double*)malloc(rows*cols*sizeof(double));
        int i;
        for (i=1; i<rows; i++)
            (*C)[i] = (*C)[i-1] + cols;
    }

    void matrix_multiplication (double ***A, double ***B, double ***C, int rowsC,int colsC,int colsA) {
        double sum;
        int i,j,k;
        for (i = 0; i < rowsC; i++) {
            for (j = 0; j < colsC; j++) {
                sum = 0.0;
                for (k = 0; k < colsA; k++) {
                    sum = sum + (*A)[i][k]*(*B)[k][j];
                }
                (*C)[i][j] = sum;
            }
        }
    }

    /* Reads a 2D array from a binary file*/ 
    void read_matrix_binaryformat (char* filename, double*** matrix, int* num_rows, int* num_cols) {
        int i;
        FILE* fp = fopen (filename,"rb");
        fread (num_rows, sizeof(int), 1, fp);
        fread (num_cols, sizeof(int), 1, fp);
        /* storage allocation of the matrix */
        *matrix = (double**)malloc((*num_rows)*sizeof(double*));
        (*matrix)[0] = (double*)malloc((*num_rows)*(*num_cols)*sizeof(double));
        for (i=1; i<(*num_rows); i++)
            (*matrix)[i] = (*matrix)[i-1]+(*num_cols);
        /* read in the entire matrix */
        fread ((*matrix)[0], sizeof(double), (*num_rows)*(*num_cols), fp);
        fclose (fp);
    }

    /* Writes a 2D array in a binary file */
    void write_matrix_binaryformat (char* filename, double** matrix, int num_rows, int num_cols) {
      FILE *fp = fopen (filename,"wb");
      fwrite (&num_rows, sizeof(int), 1, fp);
      fwrite (&num_cols, sizeof(int), 1, fp);
      fwrite (matrix[0], sizeof(double), num_rows*num_cols, fp);
      fclose (fp);
    }

我的任务是对矩阵A和B进行并行矩阵乘法，并将结果收集到矩阵C中。

我这样做是通过将行A中的矩阵A划分，并且每个进程将使用其片段来乘以矩阵B，并从乘法中获取它的片段。然后我将收集流程中的所有部分并将它们组合到矩阵C中。

我已经发布了一个类似的问题，但是这段代码得到了改进，我已经取得了进展但是在散点调用后我仍然遇到了分段错误。

Answer 1

所以我马上就看到了一些问题：

    MPI_Bcast (&B, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

在这里，你没有传递指向双打的指针，而是指向指向双精度指针的指针（B被定义为double **B）并且你告诉MPI要关注指针并从那里发送1双。那不行。

你可能会认为你在这里完成的工作是将指针发送到矩阵，所有任务都可以从中读取数组 - 这不起作用。这些进程不共享公共存储空间（这就是为什么MPI被称为分布式存储器编程）并且指针不会到达任何地方。您实际上必须发送矩阵的内容，

    MPI_Bcast (&(B[0][0]), rowsB*colsB, MPI_DOUBLE, 0, MPI_COMM_WORLD);

并且你必须确保其他进程提前为B矩阵正确分配了内存。

其他地方有类似的指针问题：

    MPI_Scatterv(&A[0], ..., &local_A[0]

同样，A是一个指向双精度（double **A）指针的指针，而local_A也是如此，你需要指向MPI指针指向双精度才能使其工作，类似于

    MPI_Scatterv(&(A[0][0]), ..., &(local_A[0][0])

该错误似乎出现在所有通信程序中。

请记住，在MPI中看起来像(buffer, count, TYPE)的任何内容都意味着MPI例程遵循指针buffer并在那里发送count类型的TYPE个数据。 MPI无法跟踪你发送的缓冲区内的指针，因为一般情况下它并不知道它们在那里。它只需要指针(count * sizeof(TYPE))的下一个buffer字节，并进行适当的通信。所以你必须传递一个指向TYPE类型数据流的指针。

说了这么多，如果你把事情缩小了一点，那么与你合作将会容易得多;现在你发布的程序包含很多不相关的I / O内容，这意味着没有人可以运行你的程序，看看在没有先确定矩阵格式然后生成的情况下会发生什么两个矩阵本身。在发布关于源代码的问题时，你真的想要发布（a）一小部分来源，其中（b）重现问题，（c）是完全独立的。

Answer 2

考虑到这是一个扩展的评论，因为Jonathan Dursi已经给出了相当详尽的答案。你的矩阵实际上是以一种奇怪的方式表示的，但至少你遵循了给你的另一个问题的建议，并为它们分配空间作为连续的块，而不是分别为每一行。

鉴于此，您应该替换：

MPI_Scatterv(&A[0],&proc_elements[0],&displace[0],MPI_DOUBLE,&local_A[0],
             local_rows*colsA,MPI_DOUBLE,0,MPI_COMM_WORLD);

与

MPI_Scatterv(A[0],&proc_elements[0],&displace[0],MPI_DOUBLE,local_A[0],
             local_rows*colsA,MPI_DOUBLE,0,MPI_COMM_WORLD);

A[0]已经指向矩阵数据的开头，并且不需要指向它。 local_A[0]以及MPI_Gatherv()调用的参数也是如此。

已经多次说过了 - MPI没有指针追逐，只适用于平缓冲。

我还注意到你的代码中还有一个错误 - 你的矩阵的内存没有被正确释放。您只释放指针数组而不是矩阵数据本身：

free(A);

应该真的成为

free(A[0]); free(A);

scatterv后的分段错误

2 个答案: