I am multiplying two matrices (2D arrays) in parallel with MPI by dividing the rows evenly and scattering them to the child processes. The master also works on a chunk of rows. I know how to do this successfully with MPI_Send / MPI_Recv, but now I am trying to use MPI_Bcast and cannot figure out when to Bcast and exactly what to send. When I print the finished matrix (C) at various points, it seems that not all rows are being computed/updated, and I suspect this is because I am not specifying the buffers correctly.
Code:
#include <iostream>
#include <stdlib.h>
#include <mpi.h>
#include <stdio.h>
#include <time.h>
using namespace std;
int main(int argc, char *argv[])
{
    int myid, nproc;
    int Ibuffer[200];      // Integer buffer, use proper size and type
    double Dbuffer[2000];  // Double buffer, use proper size and type
    char Sbuffer[200];     // String buffer
    int msg_len;
    int i, j, k;

    // Initialize the MPI environment and get the needed data
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    // Get the name of the processor
    MPI_Get_processor_name(Sbuffer, &msg_len);

    int RowA = 5,
        ColA = 2,
        RowB = ColA,
        ColB = 3,
        RowC = RowA,
        ColC = ColB;

    // Start clock
    double start_time = MPI_Wtime();

    // Initialize matrices
    double **matA = new double*[RowA];
    for (int i = 0; i < RowA; ++i)
        matA[i] = new double[ColA];

    double **matB = new double*[RowB];
    for (int i = 0; i < RowB; ++i)
        matB[i] = new double[ColB];

    double **matC = new double*[RowC];
    for (int i = 0; i < RowC; ++i)
        matC[i] = new double[ColC];

    for (int i = 0; i < RowA; i++) // MatA
    {
        for (int j = 0; j < ColA; j++)
        {
            matA[i][j] = 2;
        }
    }

    for (int i = 0; i < RowB; i++) // MatB
    {
        for (int j = 0; j < ColB; j++)
        {
            matB[i][j] = 2;
        }
    }

    for (int i = 0; i < RowC; i++) // MatC
    {
        for (int j = 0; j < ColC; j++)
        {
            matC[i][j] = 0;
        }
    }

    // All procs compute the chunk size, no need to send it separately
    int chunk = RowA / nproc;
    int rest = RowA % nproc;
    int my_start_row = myid * chunk;     // find my start row
    int my_end_row = (myid + 1) * chunk; // find my end row

    // assign the rest to the last worker
    if (myid == nproc - 1) my_end_row += rest;

    int Dcount = ColA * chunk; // Data count for A to send to worker

    MPI_Status status; // Status variable needed for the receive

    if (myid == 0)
    {
        // Send the rows needed for workers (Don't know if I need this or not)
        //MPI_Bcast(matA, Dcount, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        // Then work on your own part
        for (int i = my_start_row; i < my_end_row; i++)
        {
            for (int j = 0; j < ColB; j++)
            {
                for (int k = 0; k < RowB; k++)
                {
                    matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
                }
            }
        }

        for (int n = 1; n < nproc; n++)
        {
            MPI_Bcast(matC, Dcount, MPI_DOUBLE, n, MPI_COMM_WORLD);
            printf("\n ==++ Master Receive Result by Worker[%d], \n", n);
        }
    }
    else
    {
        // This is a worker, receive the needed info and start working
        //MPI_Bcast(matA, Dcount, MPI_DOUBLE, 0, MPI_COMM_WORLD);
        //printf("\n +++ Worker[%d], received %d rows from Master \n", myid, myid*chunk);
        cout << "\n === Master sent rows " << myid * chunk << " through " << (myid + 1) * chunk << " to process #" << myid << endl;

        // Do the work first
        for (int i = my_start_row; i < my_end_row; i++)
        {
            for (int j = 0; j < ColB; j++)
            {
                for (int k = 0; k < RowB; k++)
                {
                    matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]);
                }
            }
        }

        // Send the result to the Master
        MPI_Bcast(matC, Dcount, MPI_DOUBLE, myid, MPI_COMM_WORLD);
        printf("\n --- Worker[%d], Sent Result to Master \n", myid);
    }

    // End clock
    double end_time = MPI_Wtime();

    if (myid == 0) {
        cout << "\nParallel Exec time: " << end_time - start_time << endl;
    }

    MPI_Finalize();
    // Clean up and release the storage
    for (int i = 0; i < RowA; i++)
    {
        delete [] matA[i];
        matA[i] = NULL;
    }
    delete [] matA;
    matA = NULL;

    for (int i = 0; i < RowB; i++)
    {
        delete [] matB[i];
        matB[i] = NULL;
    }
    delete [] matB;
    matB = NULL;

    for (int i = 0; i < RowA; i++)
    {
        delete [] matC[i];
        matC[i] = NULL;
    }
    delete [] matC;
    matC = NULL;
}
If this question is too vague or too much trouble, I understand; I just want to know whether I am misunderstanding how and when to use Bcast.
Answer 0 (score: 0)
If I am not misreading, this code builds, at the start, three matrices A, B and C that are identical on every processor, and then computes the product of A and B, but only for a subset of the row indices. In this way, the result of this multiplication on processor rank is

C(rank) = A(begin;end) * B

on the rows it handles, and

C(rank) = 0

everywhere else.
So the problem is that MPI_Bcast neither adds matrices together nor concatenates them: it is a broadcast function, which sends the buffer (here the matrix C) of the root processor to all the other processes. Each processor's Bcast therefore overwrites the result of the previous Bcast.
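To make the broadcast semantics concrete, here is a minimal sketch, not the question's code: after the call, every rank holds an identical copy of rank 0's buffer, and nothing is summed or merged. A flat std::vector<double> is used because MPI_Bcast expects one contiguous buffer, unlike the double** array of row pointers above; the name flatA and the sizes are purely illustrative.

#include <mpi.h>
#include <vector>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int myid;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);

    const int RowA = 5, ColA = 2;
    std::vector<double> flatA(RowA * ColA, 0.0); // element (i,j) lives at flatA[i * ColA + j]

    if (myid == 0)                               // only the root fills the matrix
        for (double &x : flatA) x = 2.0;

    // Every rank calls MPI_Bcast with the same root (0). Afterwards all ranks
    // hold a copy of rank 0's data; Bcast does not add or concatenate buffers.
    MPI_Bcast(flatA.data(), RowA * ColA, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}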
To concatenate buffers, the function to use is MPI_Gather. Here, however, since the matrices are already allocated at full size on every process from the start, plain concatenation is not a good fit either.
Two options:

- use MPI_Reduce or MPI_Allreduce to sum the partial C matrices (but the operation that would be performed is x + (nbprocs-1)*0, so calling such a function is not really useful);
- have each process compute only its own block of rows into a smaller buffer and use MPI_Gather to merge the results on the master (see the sketch after this answer).

Hope it helps! Good luck!
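For the second option, here is a minimal sketch of the gather-based layout, not the original code: each rank computes only its own block of rows of C into a small local buffer, and MPI_Gatherv, the variable-count form of MPI_Gather, assembles the blocks on rank 0 (variable counts are needed because the last rank takes the remainder rows, as in the question's row split). Flat row-major std::vector storage is assumed, and the names myC, counts and displs are illustrative.

#include <cstdio>
#include <mpi.h>
#include <vector>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int myid, nproc;
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    const int RowA = 5, ColA = 2, ColB = 3;

    // Same inputs on every rank, as in the question (A and B filled with 2s),
    // stored as flat row-major vectors: element (i,j) of A is A[i * ColA + j].
    std::vector<double> A(RowA * ColA, 2.0), B(ColA * ColB, 2.0);

    // Same row split as the question: equal chunks, remainder to the last rank.
    int chunk = RowA / nproc, rest = RowA % nproc;
    int my_start = myid * chunk;
    int my_rows  = chunk + (myid == nproc - 1 ? rest : 0);

    // Compute only my rows of C, into a buffer holding just those rows.
    std::vector<double> myC(my_rows * ColB, 0.0);
    for (int i = 0; i < my_rows; ++i)
        for (int j = 0; j < ColB; ++j)
            for (int k = 0; k < ColA; ++k)
                myC[i * ColB + j] += A[(my_start + i) * ColA + k] * B[k * ColB + j];

    // How many doubles each rank contributes, and where its block starts in C.
    std::vector<int> counts(nproc), displs(nproc);
    for (int p = 0; p < nproc; ++p) {
        counts[p] = (chunk + (p == nproc - 1 ? rest : 0)) * ColB;
        displs[p] = p * chunk * ColB;
    }

    std::vector<double> C;                 // full result exists only on rank 0
    if (myid == 0) C.resize(RowA * ColB);

    MPI_Gatherv(myC.data(), my_rows * ColB, MPI_DOUBLE,
                C.data(), counts.data(), displs.data(), MPI_DOUBLE,
                0, MPI_COMM_WORLD);

    if (myid == 0)
        std::printf("C[0][0] = %f\n", C[0]); // each element should be 2*2*ColA = 8

    MPI_Finalize();
    return 0;
}

With this layout the only full-size C lives on rank 0. The reduce-based first option would instead keep the full-size, zero-initialized C on every rank, as the question's code already does, and sum the partial matrices element-wise, which moves mostly zeros around.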