MPI: bordered block system/matrix with LU decomposition

Date: 2013-09-11 14:18:52

Tags: c matrix block mpi diagonal

I have to solve a bordered block system. Here are two versions: the serial version, which works correctly, and the parallel version (with MPI), which does not, and I don't know why... Can someone help me figure out the reason?
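
For context, the structure implied by the code below (my reading of it, not something stated explicitly in the post) is a bordered block-diagonal system: diagonal blocks Ai, right-border blocks Bi, bottom-border blocks Ci, and a corner block As. The code forms and solves the Schur-complement system

(As - sum_{i=0..N-1} Ci * Ai^-1 * Bi) * xs = bs - sum_{i=0..N-1} Ci * Ai^-1 * bi

and then recovers each remaining block of the solution from Ai * xi = bi - Bi * xs.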

  

===================================================================================
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions

Serial version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>

#include "res.h"

// Number of blocks
#define N 128

// Size of each block
#define BS 256

// Leading Dimension
#define LD BS * BS

/*
 *
 */
int main (int argc, char ** argv) {

    // Start the timer
    clock_t t_end, t_start = clock();

    double time;
    size_t ii, jj;

    /* 
     * Allocate all the memory needed for the computation and populate
     * (in order) the matrices A, B, C, As, bi, bs.
     */
    double *A = calloc (LD * N, sizeof(*A));
    double *B = calloc (LD * N, sizeof(*B));
    double *C = calloc (LD * N, sizeof(*C));
    double *As = calloc (LD, sizeof(*As));

    double *x = calloc (BS * N, sizeof(*x));    
    double *xs = calloc (BS, sizeof(*xs));
    double *B_tmp = calloc (BS * N, sizeof(*B_tmp));

    double *b = calloc (BS * N, sizeof(*b));
    double *bs = calloc (BS, sizeof(*bs));

    double *Y = calloc (LD * N, sizeof(*Y));
    double *y = calloc (BS * N, sizeof(*y));
    double *D = calloc (LD * N, sizeof(*D));
    double *d = calloc (BS * N, sizeof(*d));

    double *A_cap = calloc (LD, sizeof(*A_cap));
    double *A_tmp = calloc (LD, sizeof(*A_tmp));
    double *b_cap = calloc (BS, sizeof(*b_cap));
    double *b_tmp = calloc (BS, sizeof(*b_tmp));

    double *xi = calloc (BS * N, sizeof(*xi));
    double *xs_tmp = calloc (BS, sizeof(*xs_tmp));

    for (ii = 0; ii < N; ii++){
        genera_Ai(&A[LD*ii], BS, ii+1);
        genera_Bi(&B[LD*ii], BS, ii+1);
        genera_Bi(&C[LD*ii], BS, ii+1);
    }

    genera_Ai(As, BS, N+2);

    for (ii = 0; ii < N; ii++) {
        for(jj = 0; jj < BS; jj++) 
            b[BS*ii + jj] = 1;
    }

    for (ii = 0; ii < BS; ii++) 
        bs[ii] = 1;

    /* Step #1 
     * Compute the LU factorization of all the matrices A0, ..., AN.
     * Each matrix Ai has size BS * BS (256 x 256), i.e. LD.
     */
    for (ii = 0; ii < N; ii++) 
        LU (&A[LD*ii], BS); 
    /* Find the matrices Di = Ai^-1 * Bi and di = Ai^-1 * bi using the LU
    factorization with forward elimination and backward substitution */
    for (ii = 0; ii < N; ii++) {
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);
    }
    for (ii = 0; ii < N; ii++) {
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);
    }

    /* Step #2
     * Compute A_cap and b_cap: first the matrix-matrix product, then the
     * subtraction from As | bs (for A_cap and b_cap respectively).
     *
     *  A_tmp = A_tmp + (Ci * Di) ---> A_cap = As - A_tmp
     *  b_tmp = b_tmp + (Ci * di) ---> b_cap = bs - b_tmp
     */
    // #2.1 - Matrix-matrix product, i.e. A_tmp = A_tmp + (Ci * Di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, A_tmp, BS);
    // #2.2 - Compute A_cap, i.e. A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    // #2.3 - Matrix product, i.e. b_tmp = b_tmp + (Ci * di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, b_tmp, 1);
    // #2.4 - Compute b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];


    /* Step #3
     * Solve the system A_cap * xs = b_cap, where:
     *      
     * A_cap = (As - Ci * Ai^-1 * Bi)
     * b_cap = (bs - Ci * Ai^-1 * bi)
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    /*
     * Step #4
     * 
     * Compute the remaining components of the solution vector x by solving
     * the N linear systems Ai * xi = bi - Bi * xs.
     */
    for (ii = 0; ii < N; ii++) {
        // #4.1 Compute Bi*xs = B_tmp
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &B_tmp[BS*ii], 1);

        // #4.2 Compute bi - B_tmp = B_tmp
        for (jj = 0; jj < BS; jj++)
            B_tmp[BS*ii + jj] = b[BS*ii + jj] - B_tmp[BS*ii + jj];

        // #4.3 Solve the system Ai*xi = x
        elim_avanti(&A[LD*ii], BS, &B_tmp[BS*ii], 1, xs_tmp);
        sost_indietro(&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
    }

    // Store all the solutions in the final vector xi
    for (ii = 0; ii <= N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Stop the timer
    t_end = clock();

    time = (t_end - t_start) / (double) CLOCKS_PER_SEC;
    printf("Time is: %f\n", time);

    free(A); free(B); free(C); free(As);
    free(x); free(xs); free(b); free(bs);
    free(Y); free(y);  free(D); free(d);
    free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
    free(xs_tmp); free(xi);

    return  0;
}

Parallel version:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "res.h"

// Blocks
#define N 128

// Block Size
#define BS 256

// Leading Dimension
#define LD BS * BS

// The master node
#define MASTER 0

// Globals

/*
 *
 */
int main (int argc, char ** argv) {

    size_t ii, jj;
    int my_rank, num_nodes;
    double t_start, t_end;
    double *A, *A_recv, *B, *B_recv, *C, *C_recv;
    double *As, *x, *xs, *b, *bs;
    double *Y, *y, *D, *d, *D_tmp, *d_tmp;
    double *A_cap, *A_tmp, *b_cap, *b_tmp;
    double *xi, *xs_tmp;    
    double *x_res;

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);

    /**************************** From Master *******************************/
    if (my_rank == MASTER) {

        t_start = MPI_Wtime();

        A = calloc (LD * N, sizeof(*A));
        B = calloc (LD * N, sizeof(*B));
        C = calloc (LD * N, sizeof(*C));
        As = calloc (LD, sizeof(*As));
        x = calloc (BS * N, sizeof(*x));
        xs = calloc (BS, sizeof(*xs));
        b = calloc (BS * N, sizeof(*b));
        bs = calloc (BS, sizeof(*bs));
        Y = calloc (LD * N, sizeof(*Y));
        y = calloc (BS * N, sizeof(*y));
        D = calloc (LD * N, sizeof(*D));
        d = calloc (BS * N, sizeof(*d));
        A_cap = calloc (LD, sizeof(*A_cap));
        A_tmp = calloc (LD, sizeof(*A_tmp));
        b_cap = calloc (BS, sizeof(*b_cap));
        b_tmp = calloc (BS, sizeof(*b_tmp));
        d_tmp = calloc (BS, sizeof(*d_tmp));
        xi = calloc (BS * N, sizeof(*xi));
        xs_tmp = calloc (BS, sizeof(*xs_tmp));
        x_res = calloc (BS, sizeof(*x_res));

        // Initialize the matrices
        for (ii = 0; ii < N; ii++){
            genera_Ai(&A[LD*ii], BS, ii+1);
            genera_Bi(&B[LD*ii], BS, ii+1);
            genera_Bi(&C[LD*ii], BS, ii+1);
        }

        genera_Ai(As, BS, N+1);

        for (ii = 0; ii < N; ii++) {
            for(jj = 0; jj < BS; jj++) 
                b[BS*ii + jj] = 1;
        }

        for (ii = 0; ii < BS; ii++) 
            bs[ii] = 1;

        // Each node will receive (LD * N / num_nodes) entries
        A_recv = calloc (LD * N / num_nodes, sizeof(*A_recv));
        B_recv = calloc (LD * N / num_nodes, sizeof(*B_recv));
        C_recv = calloc (LD * N / num_nodes, sizeof(*C_recv));

        // Send A matrix
        MPI_Scatter (A, LD * N, MPI_DOUBLE, 
            A_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);

        // Send B matrix     
        MPI_Scatter (B, LD * N, MPI_DOUBLE, 
            B_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);

        // Send C matrix
        MPI_Scatter (C, LD * N, MPI_DOUBLE, 
            C_recv, LD * N, MPI_DOUBLE, 
            MASTER, MPI_COMM_WORLD);
    }

    /**************************** From Workers *******************************/
    /* Step #1 
     * Each processor performs the LU decomposition of its matrices Ai,
     * generating the two matrices Li and Ui.
     */
    for (ii = 0; ii < N / num_nodes; ii++) 
        LU (&A[LD*ii], BS);

    /* Step #2 
     * Each processor calculates the term di = Ci * Ai^-1 * bi for each matrix
     * Ai assigned to it, using the LU decomposition computed in step
     * number 1. Then it calculates the sum.
     */
    // 2.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);

    // 2.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);

    // #2.3 - Matrix product, i.e. d_tmp = d_tmp + (Ci * di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);

    MPI_Barrier (MPI_COMM_WORLD);

    /* Step #3
     * Through a "Reduce" communication with sum over the terms 'di', the
     * master processor obtains the sum of Ci * Ai^-1 * bi with i from 0 to n-1
     * and with this calculates b_cap.
     */
    // 3.1 - sum d_tmp
    MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #3.2 - Calc b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];


    /* Step #4
     * Each processor calculates the term Di = Ci * Ai^-1 * Bi by solving
     * the n linear systems Ai * xj = Bij
     */
    // 4.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);

    // 4.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);

    // #4.3 - Matrix-matrix product, i.e. D_tmp = D_tmp + (Ci * Di)
    for (ii = 0; ii < N; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);

    /* Step #5
     * Through a "Reduce" communication with sum over the terms 'Di', the
     * master processor obtains the sum of Ci * Ai^-1 * Bi with i from 0 to n-1
     * and with this calculates A_cap.
     */
    // 5.1 - sum all D_tmp
    MPI_Reduce (&D_tmp, &A_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #5.2 - calc A_cap, that is A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    /* Step #6 
     * The "master" processor solves the system A_cap * xs = b_cap
     * to determine xs.
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    MPI_Barrier (MPI_COMM_WORLD);

    /* Step #7
     * The solution xs is communicated to all processors via a broadcast.
     */
    if (my_rank == MASTER)
        MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    /* Step #8
     * Each j-th processor solves the linear systems Ai*xi = bi - Bi*xs
     */
    for (ii = 0; ii < N / num_nodes; ii++) {
        // #8.1 Calc Bi*xs = x
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);

        // #8.2 Calc bi-x = x
        for (jj = 0; jj < BS; jj++)
            x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];

        // #8.3 Resolve the system Ai*xi = x
        elim_avanti   (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
        sost_indietro (&A[LD*ii], BS, xs_tmp,    1, &x[BS*ii]);
    }

    /* Step #9
     * The components "xi" are combined into a single vector via a
     * "Gather" communication; together with "xs" they form the solution
     * to the initial problem.
     */
    MPI_Gather (xi, BS, MPI_DOUBLE,
         x_res, BS, MPI_DOUBLE,
             MASTER, MPI_COMM_WORLD);

    for (ii = 0; ii < N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Wait for all nodes before printing the result
    MPI_Barrier (MPI_COMM_WORLD);

    // Stop timer
    if (my_rank == MASTER) {
        t_end = MPI_Wtime();
        printf("Time is %f\n", t_end - t_start);
    }

    free(A); free(B); free(C); free(As);
    free(x); free(xs); free(b); free(bs);
    free(Y); free(y); free(D); free(d);
    free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
    free(xs_tmp); free(xi);

    MPI_Finalize();
    return  0;
}

Here is res.h: http://bpaste.net/show/yAQCCwFsZlQCCiw7GKC9/ It is correct and was provided by the teacher.

I'm sorry for my bad English and for some Italian comments; the code should be clear. My goal is to parallelize the serial part of the code with MPI, where the number of processors is always a multiple of 4. Thanks in advance.

2 answers:

Answer 0 (score: 1):

You posted this elsewhere:

Fatal error in PMPI_Reduce: Internal MPI error!, error stack:
PMPI_Reduce(1217)..............: MPI_Reduce(sbuf=0x7fffffffda88, rbuf=0x7fffffffda78, count=8388608, MPI_DOUBLE, MPI_SUM, root=0, MPI_COMM_WORLD) failed
MPIR_Reduce_impl(1029).........: 
MPIR_Reduce_intra(825).........: 
MPIR_Reduce_redscat_gather(300): 
MPIR_Localcopy(357)............: memcpy arguments alias each other, dst=0x7fffffffda78 src=0x7fffffffda88 len=67108864
[Inferior 1 (process 4210) exited with code 01]

As the error message indicates, you have aliased the arguments of MPI_Reduce. This happens because you pass the address of the pointer rather than its value, the latter being the proper way to give a data buffer to MPI, at least when you allocate your buffers dynamically.

I believe that changing this

MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

to

MPI_Reduce (d_tmp, b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

will solve your problem. If you really like the &, you can do the following:

MPI_Reduce (&d_tmp[0], &b_tmp[0], LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

but there's no point to it.

You may find that other MPI calls are wrong as well if you have used &ptr instead of ptr where a void * argument is requested.
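
As an aside, here is a minimal sketch of my own (the buffer names are hypothetical, not taken from the code above) showing which expressions are acceptable buffer arguments for a dynamically allocated buffer versus an automatic array:

#include <mpi.h>
#include <stdlib.h>

/* Hypothetical minimal sketch: valid and invalid ways to pass buffers to
 * MPI_Reduce, depending on how the buffer was declared. */
int main (int argc, char ** argv) {
    MPI_Init(&argc, &argv);

    double stack_buf[4] = {1, 2, 3, 4};                // automatic array
    double *heap_buf = calloc (4, sizeof(*heap_buf));  // dynamically allocated
    double result[4];

    // OK: an array name decays to a pointer to its first element.
    MPI_Reduce (stack_buf, result, 4, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // OK: pass the value of the pointer (equivalently &heap_buf[0]).
    MPI_Reduce (heap_buf, result, 4, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // WRONG: &heap_buf is the address of the pointer variable itself, so MPI
    // would read from and write to the wrong memory, as in the failing code.
    // MPI_Reduce (&heap_buf, result, 4, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    free (heap_buf);
    MPI_Finalize();
    return 0;
}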

Answer 1 (score: 0):

I found the solution.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>

#include "res.h"

// Number of Blocks
#define N 128

// Block Size
#define BS 256

// Leading Dimension
#define LD BS * BS

// The master node
#define MASTER 0

/*
 *
 */
int main (int argc, char ** argv) {

    size_t ii, jj;
    int my_rank, num_nodes;
    double t_start, t_end;
    double *A, *A_data, *B, *B_data, *C, *C_data;
    double *As, *x, *xs, *b, *bs;
    double *Y, *y, *D, *d, *D_tmp, *d_tmp;
    double *A_cap, *A_tmp, *b_cap, *b_tmp;
    double *xi, *xs_tmp;    
    double *x_res;

    MPI_Status status;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);

    A = calloc (LD * N / num_nodes, sizeof(*A));
    B = calloc (LD * N / num_nodes, sizeof(*B));
    C = calloc (LD * N / num_nodes, sizeof(*C));
    Y = calloc (LD * N, sizeof(*Y));
    y = calloc (BS * N, sizeof(*y));
    D = calloc (LD * N, sizeof(*D));
    d = calloc (BS * N, sizeof(*d));
    b = calloc (BS * N, sizeof(*b));
    As = calloc (LD, sizeof(*As));
    x = calloc (BS * N, sizeof(*x));
    xs = calloc (BS, sizeof(*xs));
    bs = calloc (BS, sizeof(*bs));
    A_cap = calloc (LD, sizeof(*A_cap));
    A_tmp = calloc (LD, sizeof(*A_tmp));
    b_cap = calloc (BS, sizeof(*b_cap));
    b_tmp = calloc (BS, sizeof(*b_tmp));
    d_tmp = calloc (BS, sizeof(*d_tmp));
    D_tmp = calloc (LD, sizeof(*D_tmp));
    xi = calloc (BS * N, sizeof(*xi));
    xs_tmp = calloc (BS, sizeof(*xs_tmp));
    x_res = calloc (BS * num_nodes, sizeof(*x_res));    

    if (my_rank == MASTER) {

        t_start = MPI_Wtime();

        A_data = calloc (LD * N, sizeof(*A_data));
        B_data = calloc (LD * N, sizeof(*B_data));
        C_data = calloc (LD * N, sizeof(*C_data));

        // Initialize the matrices
        for (ii = 0; ii < N; ii++){
            genera_Ai(&A_data[LD*ii], BS, ii+1);
            genera_Bi(&B_data[LD*ii], BS, ii+1);
            genera_Bi(&C_data[LD*ii], BS, ii+1);
        }

        genera_Ai(As, BS, N+1);

        for (ii = 0; ii < N; ii++) {
            for(jj = 0; jj < BS; jj++) 
                b[BS*ii + jj] = 1;
        }   

        for (ii = 0; ii < BS; ii++) 
            bs[ii] = 1;
    }

    // Send A matrix to all process
    MPI_Scatter(A_data, LD * N / num_nodes, MPI_DOUBLE, 
                A,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);  

    // Send B matrix to all process
    MPI_Scatter(B_data, LD * N / num_nodes, MPI_DOUBLE, 
                B,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);

    // Send C matrix to all process
    MPI_Scatter(C_data, LD * N / num_nodes, MPI_DOUBLE, 
                C,      LD * N / num_nodes, MPI_DOUBLE, 
                MASTER, MPI_COMM_WORLD);

    /* Step #1 
     * Each processor performs the LU decomposition of their matrices Ai 
     * generating the two matrices Li and Ui.
     */
    for (ii = 0; ii < N / num_nodes; ii++) 
        LU (&A[LD*ii], BS);

    /* Step #2 
     * Each processor calculates the term 'di' = Ci * Ai^-1 * bi for each 
     * matrix Ai assigned to it, using the LU decomposition calculated in step 
     * number 1. Then it calculates the sum.
     */
    // 2.1
    for (ii = 0; ii < N / num_nodes; ii++) {
        elim_avanti   (&A[LD*ii], BS, &b[BS*ii],  1, &y[BS*ii]);
    }

    // 2.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &y[BS*ii],  1, &d[BS*ii]);

    // #2.3 - Matrix product, i.e. d_tmp = d_tmp + (Ci * di)
    for (ii = 0; ii < N / num_nodes; ii++)
        my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);

    /* Step #3
     * Through a "Reduce" communication with the sum over the terms 'di', 
     * the master processor gets the sum of Ci * Ai^-1 * bi, with i from 0 to n-1, 
     * and with this calculates b_cap.
     */
    // 3.1 - sum of d_tmp
    MPI_Reduce (d_tmp, b_tmp, BS, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #3.2 - Compute b_cap = bs - b_tmp
    for (ii = 0; ii < BS; ii++)
        b_cap[ii] = bs[ii] - b_tmp[ii];

    /* Step #4
     * Each processor calculates the term Di = Ci * Ai^-1 * Bi by the 
     * resolution of the n linear systems Ai * xj = Bij
     */
    // 4.1
    for (ii = 0; ii < N / num_nodes; ii++)
        elim_avanti   (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);

    // 4.2
    for (ii = 0; ii < N / num_nodes; ii++)
        sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);

    // #4.3 - Matrix-matrix product, i.e. D_tmp = D_tmp + (Ci * Di)
    for (ii = 0; ii < N / num_nodes; ii++)
        my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);

    /* Step #5
     * Through a "Reduce" communication with sum over the terms 'Di', 
     * the master processor gets the sum of Ci * Ai^-1 * Bi with i from 0 to n-1 
     * and with this calculates A_cap.
     */
    // 5.1 - sum of D_tmp
    MPI_Reduce (D_tmp, A_tmp, BS, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);

    // #5.2 - Compute A_cap, that is A_cap = As - A_tmp
    for (ii = 0; ii < BS; ii++)
        for (jj = 0; jj < BS; jj++)
            A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];

    /* Step #6 
     * The master processor solves the linear system A_cap * xs = b_cap 
     * to determine xs.
     */
    LU (A_cap, BS);
    elim_avanti   (A_cap, BS, b_cap,  1, xs_tmp);
    sost_indietro (A_cap, BS, xs_tmp, 1, xs);

    /* Step #7
     * The solution xs is communicated to all processors via a broadcast.
     */
    MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);

    /* Step #8
     * Each j-th processor solves the linear systems Ai*xi = bi - Bi*xs
     */
    for (ii = 0; ii < N / num_nodes; ii++) {
        // #8.1 Calcs Bi*xs = x
        my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);

        // #8.2 Calcs bi-x = x
        for (jj = 0; jj < BS; jj++)
            x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];

        // #8.3 Resolve the system Ai*xi = x
        elim_avanti   (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
        sost_indietro (&A[LD*ii], BS, xs_tmp,    1, &x[BS*ii]);
    }

    /* Step #9
     * The components xi are combined into a single vector with a "Gather"; 
     * together with xs they form the solution to the initial problem.
     */
    MPI_Gather (xi,    BS, MPI_DOUBLE,
                x_res, BS, MPI_DOUBLE,
                MASTER, MPI_COMM_WORLD);

    for (ii = 0; ii < N; ii++){
        for (jj = 0; jj < BS; jj++) {
            if (ii < N)
                xi[BS*ii + jj] = x[BS*ii + jj];
            else
                xi[BS*ii + jj] = xs[jj];
        }
    }

    // Stop timer
    if (my_rank == MASTER) {
        t_end = MPI_Wtime();
        printf("Time is %f\n", t_end - t_start);
        free(A_data); free(B_data); free(C_data);
    }

    free(A); free(A_cap); free(A_tmp); 
    free(D); free(D_tmp); free(d_tmp); free(d);
    free(b); free(b_cap); free(b_tmp);
    free(B); free(C); 
    free(Y); free(y); 
    free(As); free(bs);
    free(x); free(xs); free(xs_tmp); free(x_res);

    MPI_Finalize();
    return  0;
}

Special thanks to Maxim for the advice :)