I have to solve a block linear system. I have two versions: a serial version that works correctly and a parallel version (with MPI) that does not, and I don't know why... can someone help me find the cause?
The parallel version aborts with:
=   BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
=   EXIT CODE: 11
=   CLEANING UP REMAINING PROCESSES
=   YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
YOUR APPLICATION TERMINATED WITH THE EXIT STRING: Segmentation fault (signal 11)
This typically refers to a problem with your application.
Please see the FAQ page for debugging suggestions.
Serial version:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include "res.h"
// Number of blocks
#define N 128
// Size of each block
#define BS 256
// Leading Dimension
#define LD BS * BS
/*
*
*/
int main (int argc, char ** argv) {
// Start the timer
clock_t t_end, t_start = clock();
double time;
size_t ii, jj;
/*
* Allocate all the memory needed for the computation and
* populate the matrices (in order) A, B, C, As, bi, bs
*/
double *A = calloc (LD * N, sizeof(*A));
double *B = calloc (LD * N, sizeof(*B));
double *C = calloc (LD * N, sizeof(*C));
double *As = calloc (LD, sizeof(*As));
double *x = calloc (BS * N, sizeof(*x));
double *xs = calloc (BS, sizeof(*xs));
double *B_tmp = calloc (BS * N, sizeof(*x));
double *b = calloc (BS * N, sizeof(*b));
double *bs = calloc (BS, sizeof(*bs));
double *Y = calloc (LD * N, sizeof(*Y));
double *y = calloc (BS * N, sizeof(*y));
double *D = calloc (LD * N, sizeof(*D));
double *d = calloc (BS * N, sizeof(*d));
double *A_cap = calloc (LD, sizeof(*A_cap));
double *A_tmp = calloc (LD, sizeof(*A_tmp));
double *b_cap = calloc (BS, sizeof(*b_cap));
double *b_tmp = calloc (BS, sizeof(*b_tmp));
double *xi = calloc (BS * N, sizeof(*xi));
double *xs_tmp = calloc (BS, sizeof(*xs_tmp));
for (ii = 0; ii < N; ii++){
genera_Ai(&A[LD*ii], BS, ii+1);
genera_Bi(&B[LD*ii], BS, ii+1);
genera_Bi(&C[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+2);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
/* Step #1
* Compute the LU factorization of all the matrices A0, ..., AN.
* Each matrix Ai has size BS * BS (256 x 256), i.e. LD.
*/
for (ii = 0; ii < N; ii++)
LU (&A[LD*ii], BS);
/* Find the matrices Di = Ai^-1 * Bi and di = Ai^-1 * bi using the LU
factorization with forward elimination and back substitution */
for (ii = 0; ii < N; ii++) {
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
}
for (ii = 0; ii < N; ii++) {
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
}
/* Step #2
* Compute A_cap and b_cap, first performing the matrix-matrix product and
* then the subtraction from As | bs (for A_cap and b_cap respectively).
*
* A_tmp = A_tmp + (Ci * Di) ---> A_cap = As - A_tmp
* b_tmp = b_tmp + (Ci * di) ---> b_cap = bs - b_tmp
*/
// #2.1 - Matrix-matrix product, i.e. A_tmp = A_tmp + (Ci * Di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, A_tmp, BS);
// #2.2 - Compute A_cap, i.e. A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
// #2.3 - Matrix product, i.e. b_tmp = b_tmp + (Ci * di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, b_tmp, 1);
// #2.4 - Compute b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #3
* Solve the system A_cap * xs = b_cap, where:
*
* A_cap = (As - Ci * Ai^-1 * Bi)
* b_cap = (bs - Ci * Ai^-1 * bi)
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
/*
* Step #4
*
* Compute the remaining components of the solution vector x by solving
* the N linear systems Ai * xi = bi - Bi * xs.
*/
for (ii = 0; ii < N; ii++) {
// #4.1 Compute Bi*xs = B_tmp
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &B_tmp[BS*ii], 1);
// #4.2 Compute bi - B_tmp = B_tmp
for (jj = 0; jj < BS; jj++)
B_tmp[BS*ii + jj] = b[BS*ii + jj] - B_tmp[BS*ii + jj];
// #4.3 Solve the system Ai*xi = x
elim_avanti(&A[LD*ii], BS, &B_tmp[BS*ii], 1, xs_tmp);
sost_indietro(&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
// Store all the solutions in the final vector xi
for (ii = 0; ii <= N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Stop the timer
t_end = clock();
time = (t_end - t_start) / (double) CLOCKS_PER_SEC;
printf("Time is: %f\n", time);
free(A); free(B); free(C); free(As);
free(x); free(xs); free(b); free(bs);
free(Y); free(y); free(D); free(d);
free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
free(xs_tmp); free(xi);
return 0;
}
Parallel version:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>
#include "res.h"
// Blocks
#define N 128
// Block Size
#define BS 256
// Leading Dimension
#define LD BS * BS
// The master node
#define MASTER 0
// Globals
/*
*
*/
int main (int argc, char ** argv) {
size_t ii, jj;
int my_rank, num_nodes;
double t_start, t_end;
double *A, *A_recv, *B, *B_recv, *C, *C_recv;
double *As, *x, *xs, *b, *bs;
double *Y, *y, *D, *d, *D_tmp, *d_tmp;
double *A_cap, *A_tmp, *b_cap, *b_tmp;
double *xi, *xs_tmp;
double *x_res;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);
/**************************** From Master *******************************/
if (my_rank == MASTER) {
t_start = MPI_Wtime();
A = calloc (LD * N, sizeof(*A));
B = calloc (LD * N, sizeof(*B));
C = calloc (LD * N, sizeof(*C));
As = calloc (LD, sizeof(*As));
x = calloc (BS * N, sizeof(*x));
xs = calloc (BS, sizeof(*xs));
b = calloc (BS * N, sizeof(*b));
bs = calloc (BS, sizeof(*bs));
Y = calloc (LD * N, sizeof(*Y));
y = calloc (BS * N, sizeof(*y));
D = calloc (LD * N, sizeof(*D));
d = calloc (BS * N, sizeof(*d));
A_cap = calloc (LD, sizeof(*A_cap));
A_tmp = calloc (LD, sizeof(*A_tmp));
b_cap = calloc (BS, sizeof(*b_cap));
b_tmp = calloc (BS, sizeof(*b_tmp));
d_tmp = calloc (BS, sizeof(*d_tmp));
xi = calloc (BS * N, sizeof(*xi));
xs_tmp = calloc (BS, sizeof(*xs_tmp));
x_res = calloc (BS, sizeof(*x_res));
// Initialize the matrices
for (ii = 0; ii < N; ii++){
genera_Ai(&A[LD*ii], BS, ii+1);
genera_Bi(&B[LD*ii], BS, ii+1);
genera_Bi(&C[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+1);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
// Each node will receive (LD * N / num_nodes) entries
A_recv = calloc (LD * N / num_nodes, sizeof(*A_recv));
B_recv = calloc (LD * N / num_nodes, sizeof(*B_recv));
C_recv = calloc (LD * N / num_nodes, sizeof(*C_recv));
// Send A matrix
MPI_Scatter (A, LD * N, MPI_DOUBLE,
A_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send B matrix
MPI_Scatter (B, LD * N, MPI_DOUBLE,
B_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send C matrix
MPI_Scatter (C, LD * N, MPI_DOUBLE,
C_recv, LD * N, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
}
/**************************** From Workers *******************************/
/* Step #1
* Each processor performs the LU decomposition of their matrices Ai
* generating the two matrices Li and Ui.
*/
for (ii = 0; ii < N / num_nodes; ii++)
LU (&A[LD*ii], BS);
/* Step #2
* Each processor calculates the term di = Ci * Ai^-1 * bi for each matrix
* Ai assigned to it, taking advantage of the LU decomposition computed in
* step #1. Then it computes the sum.
*/
// 2.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
// 2.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
// #2.3 - matrix multiplication, d_tmp = d_tmp + (Ci * di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);
MPI_Barrier (MPI_COMM_WORLD);
/* Step #3
* Through a "Reduce" communication with sum over the terms 'di', the
* master processor obtains the sum of Ci * Ai^-1 * bi with i from 0 to n-1
* and with it computes b_cap.
*/
// 3.1 - sum d_tmp
MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #3.2 - Calc b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #4
* Each processor calcs the term Di = Ci * Ai^-1 * Bi by the resolution of
* n linear systems Ai * xj = Bij
*/
// 4.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
// 4.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
// #4.3 - Matrix-matrix product, i.e. D_tmp = D_tmp + (Ci * Di)
for (ii = 0; ii < N; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);
/* Step #5
* Through a "Reduce" communication with sum over the terms 'Di', the
* master processor obtains the sum of Ci * Ai^-1 * Bi with i from 0 to n-1
* and with it computes A_cap.
*/
// 5.1 - sum all D_tmp
MPI_Reduce (&D_tmp, &A_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #5.2 - calc A_cap, that is A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
/* Step #6
* The master processor solves the system A_cap * xs = b_cap
* to determine xs.
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
MPI_Barrier (MPI_COMM_WORLD);
/* Step #7
* The solution xs is communicated to all processors via a broadcast.
*/
if (my_rank == MASTER)
MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
/* Step #8
* Each processor j solves the linear systems Ai*xi = bi - Bi*xs
*/
for (ii = 0; ii < N / num_nodes; ii++) {
// #8.1 Calc Bi*xs = x
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);
// #8.2 Calc bi-x = x
for (jj = 0; jj < BS; jj++)
x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];
// #8.3 Resolve the system Ai*xi = x
elim_avanti (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
sost_indietro (&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
/* Step #9
* The components "xi" are combined into a single vector via a
* "Gather" communication; together with "xs" they form the solution
* to the initial problem.
*/
MPI_Gather (xi, BS, MPI_DOUBLE,
x_res, BS, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
for (ii = 0; ii < N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Wait for all nodes before printing the result
MPI_Barrier (MPI_COMM_WORLD);
// Stop timer
if (my_rank == MASTER) {
t_end = MPI_Wtime();
printf("Time is %f\n", t_end - t_start);
}
free(A); free(B); free(C); free(As);
free(x); free(xs); free(b); free(bs);
free(Y); free(y); free(D); free(d);
free(A_cap); free(A_tmp); free(b_cap); free(b_tmp);
free(xs_tmp); free(xi);
MPI_Finalize();
return 0;
}
Here is res.h: http://bpaste.net/show/yAQCCwFsZlQCCiw7GKC9/ (it is correct; it was provided by the teacher).
I apologize for my poor English and for the comments (originally in Italian); the code should be clear. My goal is to parallelize the serial code with MPI, and the number of processes is always a multiple of 4. Thanks in advance.
Answer 0 (score: 1)
You posted this elsewhere:
Fatal error in PMPI_Reduce: Internal MPI error!, error stack:
PMPI_Reduce(1217)..............: MPI_Reduce(sbuf=0x7fffffffda88, rbuf=0x7fffffffda78, count=8388608, MPI_DOUBLE, MPI_SUM, root=0, MPI_COMM_WORLD) failed
MPIR_Reduce_impl(1029).........:
MPIR_Reduce_intra(825).........:
MPIR_Reduce_redscat_gather(300):
MPIR_Localcopy(357)............: memcpy arguments alias each other, dst=0x7fffffffda78 src=0x7fffffffda88 len=67108864
[Inferior 1 (process 4210) exited with code 01]
As the error message indicates, you have aliased the arguments to MPI_Reduce. This happens because you pass the address of each pointer rather than the pointer's value, which is the correct way to hand a data buffer to MPI, at least when the buffers are allocated dynamically.
I think changing this
MPI_Reduce (&d_tmp, &b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
to
MPI_Reduce (d_tmp, b_tmp, LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
will fix the problem. If you really like &, you can do the following:
MPI_Reduce (&d_tmp[0], &b_tmp[0], LD * N, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
but there is no point in doing that.
You may find that other MPI calls are wrong as well, wherever you used &ptr instead of ptr for an argument declared as void *.
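To make the distinction concrete, here is a minimal, self-contained sketch (illustrative only; it uses MPI_Allreduce simply as an example call). For a buffer obtained from calloc, the pointer itself holds the address of the data, while &pointer is the address of the local pointer variable on the stack:

#include <mpi.h>
#include <stdlib.h>

int main (int argc, char **argv) {
    MPI_Init (&argc, &argv);

    double stack_buf[4] = {1, 2, 3, 4};
    double *heap_buf = calloc (4, sizeof(*heap_buf));
    double result[4] = {0};

    // For an array on the stack, 'stack_buf' and '&stack_buf' have the same
    // address (only the types differ), so either spelling reaches the data.
    MPI_Allreduce (stack_buf, result, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    // For a heap buffer, 'heap_buf' points to the data, while '&heap_buf' is
    // the address of the pointer variable itself. Passing '&heap_buf' makes
    // MPI read and write the wrong memory; two such arguments (like &d_tmp
    // and &b_tmp in your code) are neighbouring stack slots, which is why
    // MPICH reported that dst and src alias each other.
    MPI_Allreduce (heap_buf, result, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);   // correct
    /* MPI_Allreduce (&heap_buf, result, 4, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);  WRONG */

    free (heap_buf);
    MPI_Finalize ();
    return 0;
}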
Answer 1 (score: 0)
I found the solution.
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <mpi.h>
#include "res.h"
// Number of Blocks
#define N 128
// Block Size
#define BS 256
// Leading Dimension
#define LD BS * BS
// The master node
#define MASTER 0
/*
*
*/
int main (int argc, char ** argv) {
size_t ii, jj;
int my_rank, num_nodes;
double t_start, t_end;
double *A, *A_data, *B, *B_data, *C, *C_data;
double *As, *x, *xs, *b, *bs;
double *Y, *y, *D, *d, *D_tmp, *d_tmp;
double *A_cap, *A_tmp, *b_cap, *b_tmp;
double *xi, *xs_tmp;
double *x_res;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &num_nodes);
A = calloc (LD * N / num_nodes, sizeof(*A));
B = calloc (LD * N / num_nodes, sizeof(*B));
C = calloc (LD * N / num_nodes, sizeof(*C));
Y = calloc (LD * N, sizeof(*Y));
y = calloc (BS * N, sizeof(*y));
D = calloc (LD * N, sizeof(*D));
d = calloc (BS * N, sizeof(*d));
b = calloc (BS * N, sizeof(*b));
As = calloc (LD, sizeof(*As));
x = calloc (BS * N, sizeof(*x));
xs = calloc (BS, sizeof(*xs));
bs = calloc (BS, sizeof(*bs));
A_cap = calloc (LD, sizeof(*A_cap));
A_tmp = calloc (LD, sizeof(*A_tmp));
b_cap = calloc (BS, sizeof(*b_cap));
b_tmp = calloc (BS, sizeof(*b_tmp));
d_tmp = calloc (BS, sizeof(*d_tmp));
D_tmp = calloc (LD, sizeof(*D_tmp));
xi = calloc (BS * N, sizeof(*xi));
xs_tmp = calloc (BS, sizeof(*xs_tmp));
x_res = calloc (BS * num_nodes, sizeof(*x_res));
if (my_rank == MASTER) {
t_start = MPI_Wtime();
A_data = calloc (LD * N, sizeof(*A_data));
B_data = calloc (LD * N, sizeof(*B_data));
C_data = calloc (LD * N, sizeof(*C_data));
// Initialize the matrices
for (ii = 0; ii < N; ii++){
genera_Ai(&A_data[LD*ii], BS, ii+1);
genera_Bi(&B_data[LD*ii], BS, ii+1);
genera_Bi(&C_data[LD*ii], BS, ii+1);
}
genera_Ai(As, BS, N+1);
for (ii = 0; ii < N; ii++) {
for(jj = 0; jj < BS; jj++)
b[BS*ii + jj] = 1;
}
for (ii = 0; ii < BS; ii++)
bs[ii] = 1;
}
// Send A matrix to all process
MPI_Scatter(A_data, LD * N / num_nodes, MPI_DOUBLE,
A, LD * N / num_nodes, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send B matrix to all process
MPI_Scatter(B_data, LD * N / num_nodes, MPI_DOUBLE,
B, LD * N / num_nodes, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
// Send C matrix to all process
MPI_Scatter(C_data, LD * N / num_nodes, MPI_DOUBLE,
C, LD * N / num_nodes, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
/* Step #1
* Each processor performs the LU decomposition of their matrices Ai
* generating the two matrices Li and Ui.
*/
for (ii = 0; ii < N / num_nodes; ii++)
LU (&A[LD*ii], BS);
/* Step #2
* Each processor calculates the term 'di' = Ci * Ai^-1 * bi for each
* matrix Ai assigned to it, using the LU decomposition calculated in
* step #1. Then it calculates the sum.
*/
// 2.1
for (ii = 0; ii < N / num_nodes; ii++) {
elim_avanti (&A[LD*ii], BS, &b[BS*ii], 1, &y[BS*ii]);
}
// 2.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &y[BS*ii], 1, &d[BS*ii]);
// #2.3 - Product matrix matrix, that is d_tmp = d_tmp + (Ci * di)
for (ii = 0; ii < N / num_nodes; ii++)
my_gemm (BS, 1, BS, &C[LD*ii], BS, 1.0, &d[BS*ii], 1, d_tmp, 1);
/* Step #3
* Through a "Reduce" communication with sum over the terms 'di',
* the master processor obtains the sum of Ci * Ai^-1 * bi, with i from 0
* to n-1, and with this computes b_cap.
*/
// 3.1 - sum of d_tmp
MPI_Reduce (d_tmp, b_tmp, BS, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #3.2 - Compute b_cap = bs - b_tmp
for (ii = 0; ii < BS; ii++)
b_cap[ii] = bs[ii] - b_tmp[ii];
/* Step #4
* Each processor calculates the term Di = Ci * Ai^-1 * Bi by the
* resolution of the n linear systems Ai * xj = Bij
*/
// 4.1
for (ii = 0; ii < N / num_nodes; ii++)
elim_avanti (&A[LD*ii], BS, &B[LD*ii], BS, &Y[LD*ii]);
// 4.2
for (ii = 0; ii < N / num_nodes; ii++)
sost_indietro (&A[LD*ii], BS, &Y[LD*ii], BS, &D[LD*ii]);
// #4.3 - Matrix-matrix product, that is D_tmp = D_tmp + (Ci * Di)
for (ii = 0; ii < N / num_nodes; ii++)
my_gemm (BS, BS, BS, &C[LD*ii], BS, 1.0, &D[LD*ii], BS, D_tmp, BS);
/* Step #5
* Through a "Reduce" communication with sum over the terms 'Di',
* the master processor obtains the sum of Ci * Ai^-1 * Bi, with i from 0
* to n-1, and with this computes A_cap.
*/
// 5.1 - sum of D_tmp
MPI_Reduce (D_tmp, A_tmp, BS, MPI_DOUBLE, MPI_SUM, MASTER, MPI_COMM_WORLD);
// #5.2 - Compute A_cap, that is A_cap = As - A_tmp
for (ii = 0; ii < BS; ii++)
for (jj = 0; jj < BS; jj++)
A_cap[jj+BS*ii] = As[jj+BS*ii] - A_tmp[jj+BS*ii];
/* Step #6
* The master processor solves the linear system A_cap * xs = b_cap
* to determine xs.
*/
LU (A_cap, BS);
elim_avanti (A_cap, BS, b_cap, 1, xs_tmp);
sost_indietro (A_cap, BS, xs_tmp, 1, xs);
/* Step #7
* The solution xs is communicated to all processors via a broadcast.
*/
MPI_Bcast (xs, BS, MPI_DOUBLE, MASTER, MPI_COMM_WORLD);
/* Step #8
* Each processor j solves the linear systems Ai*xi = bi - Bi*xs
*/
for (ii = 0; ii < N / num_nodes; ii++) {
// #8.1 Calcs Bi*xs = x
my_gemm(BS, 1, BS, &B[LD*ii], BS, 1, xs, 1, &x[BS*ii], 1);
// #8.2 Calcs bi-x = x
for (jj = 0; jj < BS; jj++)
x[BS*ii + jj] = b[BS*ii + jj] -x[BS*ii + jj];
// #8.3 Resolve the system Ai*xi = x
elim_avanti (&A[LD*ii], BS, &x[BS*ii], 1, xs_tmp);
sost_indietro (&A[LD*ii], BS, xs_tmp, 1, &x[BS*ii]);
}
/* Step #9
* The components xi are combined into a single vector with a "Gather";
* together with xs they form the solution to the initial problem.
*/
MPI_Gather (xi, BS, MPI_DOUBLE,
x_res, BS, MPI_DOUBLE,
MASTER, MPI_COMM_WORLD);
for (ii = 0; ii < N; ii++){
for (jj = 0; jj < BS; jj++) {
if (ii < N)
xi[BS*ii + jj] = x[BS*ii + jj];
else
xi[BS*ii + jj] = xs[jj];
}
}
// Stop timer
if (my_rank == MASTER) {
t_end = MPI_Wtime();
printf("Time is %f\n", t_end - t_start);
free(A_data); free(B_data); free(C_data);
}
free(A); free(A_cap); free(A_tmp);
free(D); free(D_tmp); free(d_tmp); free(d);
free(b); free(b_cap); free(b_tmp);
free(B); free(C);
free(Y); free(y);
free(As); free(bs);
free(x); free(xs); free(xs_tmp); free(x_res);
MPI_Finalize();
return 0;
}
Thanks to Maxim for the great advice :)