Scattering a matrix - MPI

Time: 2011-01-15 21:57:50

Tags: matrix mpi

I am trying to scatter a matrix row by row to all processors, but it results in a segmentation fault. I don't know what I'm doing wrong.. Here is my code:

      if(rank == 0) {
                    A_row = 10;
                    A_col = 10;
                    /* calculate the strip size */
                    strip_size = A_row / size;

                    /* generate Matrix A */
                    A = (double **)malloc(sizeof(double*) * 10);
                    int k = 0;
                    for(i = 0; i < 10; i++) {
                            A[i] = (double*)malloc(sizeof(double) * 10);
                            for(j = 0; j < 10; j++) {
                                    A[i][j] = k;
                                    k++;
                                    printf("%lf  ", A[i][j]);
                            }
                            printf("\n");
                    }
            }

            /* Broadcast the row and column sizes of Matrix A, as well as the strip size */
            MPI_Bcast(&A_row, 1, MPI_INT, 0, MPI_COMM_WORLD);
            MPI_Bcast(&A_col, 1, MPI_INT, 0, MPI_COMM_WORLD);
            MPI_Bcast(&strip_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

            /* defining a datatype for sub-matrix */
            MPI_Type_vector(strip_size, A_col, A_col, MPI_DOUBLE, &strip);
            MPI_Type_commit(&strip);

            strip_A = (double **)malloc(sizeof(double*)*strip_size);
            for(i= 0; i< strip_size; i++) {
                    strip_A[i] = (double*)malloc(sizeof(double)*A_col);
            }

            MPI_Scatter(&A[0][0], 1, strip, &strip_A[0][0], 1, strip, 0, MPI_COMM_WORLD);

            for(i = 0; i < strip_size; i++) {
                    if(i == 0) {
                            printf("rank = %d\n", rank);
                    }
                    for(j = 0; j < A_col; j++) {
                            printf("%lf  ", strip_A[i][j]);
                    }
                    printf("\n");
            }

Can anyone tell me what is wrong here...

This is the error I get when I run it:

mpirun -np 2 ./a.out



 0.000000  1.000000  2.000000  3.000000  4.000000  5.000000  6.000000  7.000000  8.000000  9.000000
 10.000000  11.000000  12.000000  13.000000  14.000000  15.000000  16.000000  17.000000  18.000000  19.000000
 20.000000  21.000000  22.000000  23.000000  24.000000  25.000000  26.000000  27.000000  28.000000  29.000000
 30.000000  31.000000  32.000000  33.000000  34.000000  35.000000  36.000000  37.000000  38.000000  39.000000
 40.000000  41.000000  42.000000  43.000000  44.000000  45.000000  46.000000  47.000000  48.000000  49.000000
 50.000000  51.000000  52.000000  53.000000  54.000000  55.000000  56.000000  57.000000  58.000000  59.000000
 60.000000  61.000000  62.000000  63.000000  64.000000  65.000000  66.000000  67.000000  68.000000  69.000000
 70.000000  71.000000  72.000000  73.000000  74.000000  75.000000  76.000000  77.000000  78.000000  79.000000
 80.000000  81.000000  82.000000  83.000000  84.000000  85.000000  86.000000  87.000000  88.000000  89.000000
 90.000000  91.000000  92.000000  93.000000  94.000000  95.000000  96.000000  97.000000  98.000000  99.000000 

 rank = 1
 42.000000  43.000000  44.000000  45.000000  46.000000  47.000000  48.000000  49.000000  0.000000  0.000000
 52.000000  53.000000  54.000000  55.000000  56.000000  57.000000  58.000000  59.000000  0.000000  0.000000
 0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
 0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
 0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000 
[seismicmstm:07338] *** Process received signal ***
[seismicmstm:07338] Signal: Segmentation fault (11)
[seismicmstm:07338] Signal code:  (128)
[seismicmstm:07338] Failing at address: (nil)
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 7338 on node seismicmstm.cluster exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------

3 Answers:

Answer 0 (Score: 6):

There are a couple of things going on here. The good news is that the hardest parts (creating the MPI datatype, and the basic structure of the MPI_Scatter call) are correct.

The first problem is that the MPI_Scatter line uses &(A[0][0]), but on every rank except rank zero you have not set A to point to anything! So you dereference a random pointer twice, and there is your segfault.
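
To address just this first problem, one common pattern (a sketch on my part, not something from the original answer) is to make the send-buffer argument safe to evaluate on every rank, since MPI_Scatter only reads the send buffer at the root:

/* The send buffer is only significant at the root, but the expression */
/* passed for it is still evaluated on every rank, so guard it.        */
double *sendbuf = (rank == 0) ? &(A[0][0]) : NULL;
MPI_Scatter(sendbuf, 1, strip, &(strip_A[0][0]), 1, strip, 0, MPI_COMM_WORLD);

This only removes the invalid dereference; the memory-layout problem described next still has to be dealt with.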

As suszterpatt suggests, a more subtle issue is that there is no guarantee your allocated rows of memory are contiguous, so even once you fix the above, the scatter may not work correctly. You are trying to send strip_size*A_col doubles starting from somewhere in A into strip_A, but strip_A may not consist of that many contiguous doubles; it could be A_col doubles, then some padding, then A_col more doubles, or indeed the rows could be scattered all over the place. The three ways to fix this are, in what I consider order of convenience (IMHO): (a) make the data contiguous in memory, by creating the whole array in one block and then building the two-dimensional C array of row pointers into it; (b) send one row at a time; or (c) create an MPI datatype that actually reflects how your data happens to be (possibly randomly) laid out in memory.
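
Before the full version of (a) below, a minimal sketch of option (b) might look like the following; it is my illustration rather than part of the original answer, it reuses the variables from the code above (A, strip_A, strip_size, A_col, rank, size, i, j), it assumes <string.h> is included for memcpy, and it replaces only the MPI_Scatter call. Each row is contiguous on its own, so it can be sent as one message even though the rows are not contiguous with one another:

/* Option (b), sketched: one message per row.                          */
/* The root copies its own rows and sends every other row separately.  */
if (rank == 0) {
        for (j = 0; j < strip_size; j++)
                memcpy(strip_A[j], A[j], A_col * sizeof(double));
        for (i = 1; i < size; i++)
                for (j = 0; j < strip_size; j++)
                        MPI_Send(A[i*strip_size + j], A_col, MPI_DOUBLE,
                                 i, j, MPI_COMM_WORLD);
} else {
        for (j = 0; j < strip_size; j++)
                MPI_Recv(strip_A[j], A_col, MPI_DOUBLE,
                         0, j, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}

This costs one message per row instead of a single scatter, which is part of why the contiguous-allocation approach (a) is usually nicer.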

A version using (a) that seems to work (with A_row evenly divisible by size) looks like this:

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv) {
  int rank, size;
  int strip_size, A_row, A_col;
  double **A, **strip_A, *Adata, *stripdata;
  MPI_Datatype strip;
  int i,j;

  MPI_Init(&argc,&argv) ;

  MPI_Comm_rank(MPI_COMM_WORLD,&rank) ;
  MPI_Comm_size(MPI_COMM_WORLD,&size) ;

      if(rank == 0) {
                    A_row = 10;
                    A_col = 10;
                    /* calculate the strip size */
                    strip_size = A_row / size;

                    /* generate Matrix A */
                    Adata = (double *)malloc(sizeof(double)*A_row*A_col);
                    A = (double **)malloc(sizeof(double*) * A_row);
                    for(i = 0; i < A_row; i++) {
                            A[i] = &(Adata[i*A_col]);
                    }
                    int k = 0;
                    for(i = 0; i < A_row; i++) {
                            for(j = 0; j < A_col; j++) {
                                    A[i][j] = k;
                                    k++;
                            }
                    }
            }

            /* Broadcast the row and column sizes of Matrix A, as well as the strip size */
            MPI_Bcast(&A_row, 1, MPI_INT, 0, MPI_COMM_WORLD);
            MPI_Bcast(&A_col, 1, MPI_INT, 0, MPI_COMM_WORLD);
            MPI_Bcast(&strip_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

            /* defining a datatype for sub-matrix */
            MPI_Type_vector(strip_size, A_col, A_col, MPI_DOUBLE, &strip);
            MPI_Type_commit(&strip);

            stripdata = (double *)malloc(sizeof(double)*strip_size*A_col);
            strip_A = (double **)malloc(sizeof(double*)*strip_size);
            for(i= 0; i< strip_size; i++) {
                    strip_A[i] = &(stripdata[i*A_col]);
            }

            MPI_Scatter(Adata, 1, strip, &(strip_A[0][0]), 1, strip, 0, MPI_COMM_WORLD);
            //MPI_Scatter(Adata, A_col*strip_size, MPI_DOUBLE, &(strip_A[0][0]), A_col*strip_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

            for(i = 0; i < strip_size; i++) {
                    if(i == 0) {
                            printf("rank = %d\n", rank);
                    }
                    for(j = 0; j < A_col; j++) {
                            printf("%lf  ", strip_A[i][j]);
                    }
                    printf("\n");
            }

    MPI_Type_free(&strip);
    free(strip_A);
    free(stripdata);
    free(Adata);
    free(A);
    return 0;
}

Answer 1 (Score: 1):

I think that, ultimately, what you are doing wrong is storing the matrix as an array of arrays. I think you will find that if you store it in a single array (in row-major or column-major order, whichever suits your fancy), things become much easier.
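
As an illustration of that suggestion (a sketch of mine, not code from the answer; the IDX macro is made up for the example), the matrix can live in one row-major block and still be indexed like a 2-D array:

#include <stdlib.h>

/* index into a flat row-major array as if it were 2-D */
#define IDX(i, j, ncols) ((i) * (ncols) + (j))

int main(void) {
    int A_row = 10, A_col = 10, i, j;
    double *A = malloc(sizeof(double) * A_row * A_col);

    for (i = 0; i < A_row; i++)
        for (j = 0; j < A_col; j++)
            A[IDX(i, j, A_col)] = i * A_col + j;   /* same values as before */

    /* A is now one contiguous buffer, so it can be handed straight to
       MPI_Scatter as the send buffer, with strip_size*A_col MPI_DOUBLEs
       going to each process and no derived datatype required. */

    free(A);
    return 0;
}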

Answer 2 (Score: 0):

Just add MPI_Finalize(); to your code. ;) See the code and output below. The output is correct, but it does not print in rank order because of the missing barrier; you can either use MPI_Barrier(), or use MPI_Isend() and MPI_Irecv() (a sketch of the barrier approach is given after the output below). Enjoy.

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char** argv) {
int rank, size;
int strip_size, A_row, A_col;
double **A, **strip_A, *Adata, *stripdata;
MPI_Datatype strip;
int i,j;

MPI_Init(&argc,&argv) ;

MPI_Comm_rank(MPI_COMM_WORLD,&rank) ;
MPI_Comm_size(MPI_COMM_WORLD,&size) ;

  if(rank == 0) {
                A_row = 10;
                A_col = 10;
                /* calculate the strip size */
                strip_size = A_row / size;

                /* generate Matrix A */
                Adata = (double *)malloc(sizeof(double)*A_row*A_col);
                A = (double **)malloc(sizeof(double*) * A_row);
                for(i = 0; i < A_row; i++) {
                        A[i] = &(Adata[i*A_col]);
                }
                int k = 0;
                for(i = 0; i < A_row; i++) {
                        for(j = 0; j < A_col; j++) {
                                A[i][j] = k;
                                k++;
                        }
                }
        }

        /* Broadcast the row and column sizes of Matrix A, as well as the strip size */
        MPI_Bcast(&A_row, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&A_col, 1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&strip_size, 1, MPI_INT, 0, MPI_COMM_WORLD);

        /* defining a datatype for sub-matrix */
        MPI_Type_vector(strip_size, A_col, A_col, MPI_DOUBLE, &strip);
        MPI_Type_commit(&strip);

        stripdata = (double *)malloc(sizeof(double)*strip_size*A_col);
        strip_A = (double **)malloc(sizeof(double*)*strip_size);
        for(i= 0; i< strip_size; i++) {
                strip_A[i] = &(stripdata[i*A_col]);
        }

        MPI_Scatter(Adata, 1, strip, &(strip_A[0][0]), 1, strip, 0, MPI_COMM_WORLD);
        //MPI_Scatter(Adata, A_col*strip_size, MPI_DOUBLE, &(strip_A[0][0]), A_col*strip_size, MPI_DOUBLE, 0, MPI_COMM_WORLD);

        for(i = 0; i < strip_size; i++) {
                if(i == 0) {
                        printf("rank = %d\n", rank);
                }
                for(j = 0; j < A_col; j++) {
                        printf("%lf  ", strip_A[i][j]);
                }
                printf("\n");
        }
        if(rank == 0) {
                MPI_Type_free(&strip);
                free(strip_A);
                free(stripdata);
                free(Adata);
                free(A);
        }

        MPI_Finalize();

        return 0;
}

Output

rank = 0
0.000000  1.000000  2.000000  3.000000  4.000000  5.000000  6.000000  7.000000  8.000000  9.000000
rank = 2
 20.000000  21.000000  22.000000  23.000000  24.000000  25.000000  26.000000  27.000000  28.000000  29.000000  
rank = 6
60.000000  61.000000  62.000000  63.000000  64.000000  65.000000  66.000000  67.000000  68.000000  69.000000  
rank = 1
 10.000000  11.000000  12.000000  13.000000  14.000000  15.000000  16.000000  17.000000  18.000000  19.000000  
rank = 3
30.000000  31.000000  32.000000  33.000000  34.000000  35.000000  36.000000  37.000000  38.000000  39.000000  
rank = 5
50.000000  51.000000  52.000000  53.000000  54.000000  55.000000  56.000000  57.000000  58.000000  59.000000  
rank = 8
80.000000  81.000000  82.000000  83.000000  84.000000  85.000000  86.000000  87.000000  88.000000  89.000000  
rank = 7
70.000000  71.000000  72.000000  73.000000  74.000000  75.000000  76.000000  77.000000  78.000000  79.000000  
rank = 9
90.000000  91.000000  92.000000  93.000000  94.000000  95.000000  96.000000  97.000000  98.000000  99.000000  
rank = 4
40.000000  41.000000  42.000000  43.000000  44.000000  45.000000  46.000000  47.000000  48.000000  49.000000
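
For reference, here is a sketch (mine, not part of the answer) of the MPI_Barrier idea mentioned above for getting the strips to print in rank order; it reuses the variables from the code above and would replace the final print loop. Note that even with barriers MPI does not strictly guarantee the ordering of forwarded stdout, but in practice this usually comes out in order:

/* Each rank takes its turn: only the matching rank prints, and a      */
/* barrier separates the turns.                                        */
int r;
for (r = 0; r < size; r++) {
        if (r == rank) {
                printf("rank = %d\n", rank);
                for (i = 0; i < strip_size; i++) {
                        for (j = 0; j < A_col; j++)
                                printf("%lf  ", strip_A[i][j]);
                        printf("\n");
                }
                fflush(stdout);
        }
        MPI_Barrier(MPI_COMM_WORLD);
}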