我在两台不同的机器上运行程序。在一个它没有问题的情况下工作正常另一方面,它会导致分段错误。通过调试,我已经找出了故障发生的位置,但我无法弄清楚它发生的合理原因。
在一个函数中,我有以下代码:
pass_particles(particle_grid, particle_properties, input_data, coll_eros_track, collision_number_part, world, grid_rank_lookup, grid_locations);
cout<<"done passing particles"<<endl;
函数pass_particles看起来像:
void pass_particles(map<int,map<int,Particle> > & particle_grid, std::vector<Particle_props> & particle_properties, User_input& input_data, data_tracking & coll_eros_track, vector<int> & collision_number_part, mpi::communicator & world, std::map<int,int> & grid_rank_lookup, map<int,std::vector<double> > & grid_locations)
{
//cout<<"east-west"<<endl;
//east-west exchange (x direction)
map<int, vector<Particle> > particles_to_be_sent_east;
map<int, vector<Particle> > particles_to_be_sent_west;
vector<Particle> particles_received_east;
vector<Particle> particles_received_west;
int counter_x_sent=0;
int counter_x_received=0;
for(grid_iter=particle_grid.begin();grid_iter!=particle_grid.end();grid_iter++)
{
map<int,Particle>::iterator part_iter;
for (part_iter=grid_iter->second.begin();part_iter!=grid_iter->second.end();)
{
if (particle_properties[part_iter->second.global_part_num()].particle_in_box()[grid_iter->first])
{
//decide if a particle has left the box...need to consider whether particle was already outside the box
if ((part_iter->second.position().x()<(grid_locations[grid_iter->first][0]) && part_iter->second.position().x()>(grid_locations[grid_iter->first-input_data.z_numboxes()][0]))
|| (input_data.periodic_walls_x() && (grid_iter->first-floor(grid_iter->first/(input_data.xz_numboxes()))*input_data.xz_numboxes()<input_data.z_numboxes()) && (part_iter->second.position().x()>(grid_locations[input_data.total_boxes()-1][0]))))
{
particles_to_be_sent_west[grid_iter->first].push_back(part_iter->second);
particle_properties[particle_grid[grid_iter->first][part_iter->first].global_part_num()].particle_in_box()[grid_iter->first]=false;
counter_sent++;
counter_x_sent++;
}
else if ((part_iter->second.position().x()>(grid_locations[grid_iter->first][1]) && part_iter->second.position().x()<(grid_locations[grid_iter->first+input_data.z_numboxes()][1]))
|| (input_data.periodic_walls_x() && (grid_iter->first-floor(grid_iter->first/(input_data.xz_numboxes()))*input_data.xz_numboxes())>input_data.xz_numboxes()-input_data.z_numboxes()-1) && (part_iter->second.position().x()<(grid_locations[0][1])))
{
particles_to_be_sent_east[grid_iter->first].push_back(part_iter->second);
particle_properties[particle_grid[grid_iter->first][part_iter->first].global_part_num()].particle_in_box()[grid_iter->first]=false;
counter_sent++;
counter_x_sent++;
}
//select particles in overlap areas to send to neighboring cells
else if ((part_iter->second.position().x()>(grid_locations[grid_iter->first][0]) && part_iter->second.position().x()<(grid_locations[grid_iter->first][0]+input_data.diam_large())))
{
particles_to_be_sent_west[grid_iter->first].push_back(part_iter->second);
counter_sent++;
counter_x_sent++;
}
else if ((part_iter->second.position().x()<(grid_locations[grid_iter->first][1]) && part_iter->second.position().x()>(grid_locations[grid_iter->first][1]-input_data.diam_large())))
{
particles_to_be_sent_east[grid_iter->first].push_back(part_iter->second);
counter_sent++;
counter_x_sent++;
}
++part_iter;
}
else if (particles_received_current[grid_iter->first].find(part_iter->first)!=particles_received_current[grid_iter->first].end())
{
if ((part_iter->second.position().x()>(grid_locations[grid_iter->first][0]) && part_iter->second.position().x()<(grid_locations[grid_iter->first][0]+input_data.diam_large())))
{
particles_to_be_sent_west[grid_iter->first].push_back(part_iter->second);
counter_sent++;
counter_x_sent++;
}
else if ((part_iter->second.position().x()<(grid_locations[grid_iter->first][1]) && part_iter->second.position().x()>(grid_locations[grid_iter->first][1]-input_data.diam_large())))
{
particles_to_be_sent_east[grid_iter->first].push_back(part_iter->second);
counter_sent++;
counter_x_sent++;
}
part_iter++;
}
else
{
particle_grid[grid_iter->first].erase(part_iter++);
counter_removed++;
}
}
}
world.barrier();
mpi::request reqs_x_send[particles_to_be_sent_west.size()+particles_to_be_sent_east.size()];
vector<multimap<int,int> > box_sent_x_info;
box_sent_x_info.resize(world.size());
vector<multimap<int,int> > box_received_x_info;
box_received_x_info.resize(world.size());
int counter_x_reqs=0;
//send particles
for(grid_iter_vec=particles_to_be_sent_west.begin();grid_iter_vec!=particles_to_be_sent_west.end();grid_iter_vec++)
{
if (grid_iter_vec->second.size()!=0)
{
//send a particle. 50 will be "west" tag
if (input_data.periodic_walls_x() && (grid_iter_vec->first-floor(grid_iter_vec->first/(input_data.xz_numboxes()))*input_data.xz_numboxes()<input_data.z_numboxes()))
{
reqs_x_send[counter_x_reqs++]=world.isend(grid_rank_lookup[grid_iter_vec->first + input_data.z_numboxes()*(input_data.x_numboxes()-1)], grid_iter_vec->first + input_data.z_numboxes()*(input_data.x_numboxes()-1), particles_to_be_sent_west[grid_iter_vec->first]);
box_sent_x_info[grid_rank_lookup[grid_iter_vec->first + input_data.z_numboxes()*(input_data.x_numboxes()-1)]].insert(pair<int,int>(world.rank(), grid_iter_vec->first + input_data.z_numboxes()*(input_data.x_numboxes()-1)));
}
else if (!(grid_iter_vec->first-floor(grid_iter_vec->first/(input_data.xz_numboxes()))*input_data.xz_numboxes()<input_data.z_numboxes()))
{
reqs_x_send[counter_x_reqs++]=world.isend(grid_rank_lookup[grid_iter_vec->first - input_data.z_numboxes()], grid_iter_vec->first - input_data.z_numboxes(), particles_to_be_sent_west[grid_iter_vec->first]);
box_sent_x_info[grid_rank_lookup[grid_iter_vec->first - input_data.z_numboxes()]].insert(pair<int,int>(world.rank(),grid_iter_vec->first - input_data.z_numboxes()));
}
}
}
for(grid_iter_vec=particles_to_be_sent_east.begin();grid_iter_vec!=particles_to_be_sent_east.end();grid_iter_vec++)
{
if (grid_iter_vec->second.size()!=0)
{
//send a particle. 60 will be "east" tag
if (input_data.periodic_walls_x() && (grid_iter_vec->first-floor(grid_iter_vec->first/(input_data.xz_numboxes())*input_data.xz_numboxes())>input_data.xz_numboxes()-input_data.z_numboxes()-1))
{
reqs_x_send[counter_x_reqs++]=world.isend(grid_rank_lookup[grid_iter_vec->first - input_data.z_numboxes()*(input_data.x_numboxes()-1)], 2000000000-(grid_iter_vec->first - input_data.z_numboxes()*(input_data.x_numboxes()-1)), particles_to_be_sent_east[grid_iter_vec->first]);
box_sent_x_info[grid_rank_lookup[grid_iter_vec->first - input_data.z_numboxes()*(input_data.x_numboxes()-1)]].insert(pair<int,int>(world.rank(),2000000000-(grid_iter_vec->first - input_data.z_numboxes()*(input_data.x_numboxes()-1))));
}
else if (!(grid_iter_vec->first-floor(grid_iter_vec->first/(input_data.xz_numboxes())*input_data.xz_numboxes())>input_data.xz_numboxes()-input_data.z_numboxes()-1))
{
reqs_x_send[counter_x_reqs++]=world.isend(grid_rank_lookup[grid_iter_vec->first + input_data.z_numboxes()], 2000000000-(grid_iter_vec->first + input_data.z_numboxes()), particles_to_be_sent_east[grid_iter_vec->first]);
box_sent_x_info[grid_rank_lookup[grid_iter_vec->first + input_data.z_numboxes()]].insert(pair<int,int>(world.rank(), 2000000000-(grid_iter_vec->first + input_data.z_numboxes())));
}
}
}
counter=0;
for (int i=0;i<world.size();i++)
{
//if (world.rank()!=i)
//{
reqs[counter++]=world.isend(i,1000000000,box_sent_x_info[i]);
reqs[counter++]=world.irecv(i,1000000000,box_received_x_info[i]);
//}
}
mpi::wait_all(reqs, reqs + world.size()*2);
//receive particles
//receive west particles
for (int j=0;j<world.size();j++)
{
multimap<int,int>::iterator received_info_iter;
for (received_info_iter=box_received_x_info[j].begin();received_info_iter!=box_received_x_info[j].end();received_info_iter++)
{
//receive the message
if (received_info_iter->second<1000000000)
{
//receive the message
world.recv(received_info_iter->first,received_info_iter->second,particles_received_west);
//loop through all the received particles and add them to the particle_grid for this processor
for (unsigned int i=0;i<particles_received_west.size();i++)
{
particle_grid[received_info_iter->second].insert(pair<int,Particle>(particles_received_west[i].global_part_num(),particles_received_west[i]));
if(particles_received_west[i].position().x()>grid_locations[received_info_iter->second][0] && particles_received_west[i].position().x()<grid_locations[received_info_iter->second][1])
{
particle_properties[particles_received_west[i].global_part_num()].particle_in_box()[received_info_iter->second]=true;
}
counter_received++;
counter_x_received++;
}
}
else
{
//receive the message
world.recv(received_info_iter->first,received_info_iter->second,particles_received_east);
//loop through all the received particles and add them to the particle_grid for this processor
for (unsigned int i=0;i<particles_received_east.size();i++)
{
particle_grid[2000000000-received_info_iter->second].insert(pair<int,Particle>(particles_received_east[i].global_part_num(),particles_received_east[i]));
if(particles_received_east[i].position().x()>grid_locations[2000000000-received_info_iter->second][0] && particles_received_east[i].position().x()<grid_locations[2000000000-received_info_iter->second][1])
{
particle_properties[particles_received_east[i].global_part_num()].particle_in_box()[2000000000-received_info_iter->second]=true;
}
counter_received++;
counter_x_received++;
}
}
}
}
mpi::wait_all(reqs_y_send, reqs_y_send + particles_to_be_sent_bottom.size()+particles_to_be_sent_top.size());
mpi::wait_all(reqs_z_send, reqs_z_send + particles_to_be_sent_south.size()+particles_to_be_sent_north.size());
mpi::wait_all(reqs_x_send, reqs_x_send + particles_to_be_sent_west.size()+particles_to_be_sent_east.size());
cout<<"x sent "<<counter_x_sent<<" and received "<<counter_x_received<<" from rank "<<world.rank()<<endl;
cout<<"rank "<<world.rank()<<" sent "<<counter_sent<<" and received "<<counter_received<<" and removed "<<counter_removed<<endl;
cout<<"done passing"<<endl;
}
我只发布了一些代码(因此忽略了一些变量可能看似未定义的事实,因为它们位于我没有发布的代码的一部分中)
当我运行代码时(在它失败的机器上),我得到done passing
但不是done passing particles
我很遗憾在被调用函数的结尾和调用函数中的下一行之间可能导致分段错误,以及为什么它会在一台机器而不是另一台机器上发生。
答案 0 :(得分:3)
如果你在函数的结尾和调用者中的后续行之间崩溃,你可能会崩溃在局部变量的析构函数中。您需要在调试器中运行该程序以找出哪个对象的析构函数崩溃。
答案 1 :(得分:2)
有几种可能性:
尝试在调试器中运行它以找出它实际崩溃的位置。
编辑:
由于您已经提到过您正在使用gcc,请添加-g标志并使用gdb运行它。然后Gdb会告诉你它出错的确切位置(可能是空取消引用)。
答案 2 :(得分:0)
以防万一有人以后再回来。我更新到最新版本的boost mpi(当时),1.50,这个问题消失了。没有多少解决方案,但它确实有效。