MPI_Scatter segmentation fault depending on the number of nodes

Time: 2017-05-23 23:10:30

Tags: c++ segmentation-fault mpi

While running test code for MPI_Scatter I came across some strange behavior. The program seems to work fine, but it returns a segmentation fault if the number of nodes is greater than 4. I compile with mpicxx and run with mpirun -n N ./a.o.

#include <mpi.h>
#include <vector>
#include <stdio.h>

using std::vector;

int main(void){
    MPI_Init(NULL,NULL);
    int num_PE;
    MPI_Comm_size(MPI_COMM_WORLD, &num_PE);
    int my_PE;
    MPI_Comm_rank(MPI_COMM_WORLD, &my_PE);

    int data_per_PE=2;
    int remainder=0; //conceptually should be less than data_per_PE but shouldn't matter from code perspective
    vector<int> elem_count(num_PE,data_per_PE); //number of elements to scatter
    elem_count[num_PE-1]=data_per_PE+remainder; //let last PE take extra load
    vector<int> start_send(num_PE); //the offset to send from main buffer
    vector<double> small_vec(data_per_PE+remainder); //small place to store values
    vector<double> bigVec; //the big list to distribute to processes
    if (my_PE==0){
        bigVec.reserve(data_per_PE*num_PE+remainder); //make room
        for(int i=0; i<data_per_PE*num_PE+remainder; i++){
            bigVec.push_back(static_cast<double>(i)+1.0); //1,2,3...
            start_send[i]=i*data_per_PE; //the stride
        }
    }
    // MPI_Scatterv(&bigVec[0],&elem_count[0],&start_send[0],MPI_DOUBLE,&small_vec[0],data_per_PE+remainder,MPI_DOUBLE,0,MPI_COMM_WORLD);
    MPI_Scatter(&bigVec[0],data_per_PE,MPI_DOUBLE,&small_vec[0],data_per_PE,MPI_DOUBLE,0,MPI_COMM_WORLD); //scatter
    if (my_PE==0){
        printf("Proc \t elems \n");
    }
    MPI_Barrier(MPI_COMM_WORLD); //let everything catch up before printing
    for (int i=0;i<data_per_PE+remainder;i++){
        printf("%d \t %f \n", my_PE, small_vec[i]); //print the values scattered to each processor
    }

    MPI_Barrier(MPI_COMM_WORLD); //don't think this is necessary but won't hurt
    MPI_Finalize(); //finish

    return 0;
}

2 Answers:

Answer 0 (score: 0)

You are writing past the end of start_send's internal storage, corrupting the heap and any other objects stored there:

if (my_PE==0){
    bigVec.reserve(data_per_PE*num_PE+remainder); //make room
    for(int i=0; i<data_per_PE*num_PE+remainder; i++){
        bigVec.push_back(static_cast<double>(i)+1.0); //1,2,3...
        start_send[i]=i*data_per_PE; //the stride               <--- HERE
    }
}

i runs all the way up to data_per_PE*num_PE+remainder - 1, but start_send only has storage for num_PE elements. Writing past the end destroys the allocator's linked list of heap blocks, and the program will likely segfault when a destructor tries to free the corrupted block or when some other heap object is accessed.
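One way to keep the same behaviour without the overrun is to fill start_send in its own loop bounded by num_PE. A minimal sketch of the corrected root-only block, reusing the question's variable names:

if (my_PE==0){
    bigVec.reserve(data_per_PE*num_PE+remainder); //make room
    for(int i=0; i<data_per_PE*num_PE+remainder; i++){
        bigVec.push_back(static_cast<double>(i)+1.0); //1,2,3...
    }
    for(int i=0; i<num_PE; i++){
        start_send[i]=i*data_per_PE; //stride loop never leaves start_send's num_PE elements
    }
}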

Answer 1 (score: 0)

The problem is not related to the scatter at all, but to this line:

start_send[i]=i*data_per_PE;

Since i can exceed num_PE, you write outside the bounds of start_send - overwriting memory that likely belongs to another object.

This could easily have been found by creating a truly minimal example.
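For illustration, a minimal sketch of such a stripped-down test (no MPI at all; std::vector::at is used so the overrun is reported immediately instead of silently corrupting the heap):

#include <vector>

int main(){
    int num_PE=4, data_per_PE=2, remainder=0;
    std::vector<int> start_send(num_PE); //only num_PE elements
    for(int i=0; i<data_per_PE*num_PE+remainder; i++){
        start_send.at(i)=i*data_per_PE; //throws std::out_of_range as soon as i reaches num_PE
    }
    return 0;
}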

There is another problem in your code: &bigVec[0] on ranks with my_PE!=0. Although MPI_Scatter ignores the send buffer argument on non-root ranks, the expression itself calls std::vector::operator[] to take the address of the first element. Since the vector is empty on those ranks, this is undefined behavior in its own right. Here is an explanation as to why it can create subtle problems. Use bigVec.data() instead.
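Putting both fixes together, the scatter call could look like the sketch below (based on the question's variables; std::vector::data() may be called even on an empty vector as long as the resulting pointer is not dereferenced, which is exactly the non-root case here):

MPI_Scatter(bigVec.data(), data_per_PE, MPI_DOUBLE,
            small_vec.data(), data_per_PE, MPI_DOUBLE,
            0, MPI_COMM_WORLD);

// or, to honour the per-rank counts and offsets prepared in elem_count/start_send:
// MPI_Scatterv(bigVec.data(), elem_count.data(), start_send.data(), MPI_DOUBLE,
//              small_vec.data(), data_per_PE+remainder, MPI_DOUBLE, 0, MPI_COMM_WORLD);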