When performing the final reduction (the sum of a set of matrices accumulated in my program), I do the following:
struct Tomo {
    typedef Eigen::Matrix<int, HISTOGRAM_BOXES, HISTOGRAM_BOXES, Eigen::RowMajor> HistoMtx;
    HistoMtx exp_val;
    HistoMtx u;
    [...]

struct buffer_set {
    Tomo * X;
    Tomo * Y;
    Tomo * Z;
} buffers[2];

[...]
if(rank == 0){
    /* MASTER NODE */
    for(int source=1; source<size; source++){
        printf("Reducing from %i\n", source);
        for(int i=0;i<env_count;i++){
            MPI_Recv(buffers[1].X[i].exp_val.data(), buffers[1].X[i].exp_val.size(), MPI_INT, source, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(buffers[1].Y[i].exp_val.data(), buffers[1].Y[i].exp_val.size(), MPI_INT, source, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(buffers[1].Z[i].exp_val.data(), buffers[1].Z[i].exp_val.size(), MPI_INT, source, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(buffers[1].X[i].u.data(), buffers[1].X[i].u.size(), MPI_INT, source, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(buffers[1].Y[i].u.data(), buffers[1].Y[i].u.size(), MPI_INT, source, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(buffers[1].Z[i].u.data(), buffers[1].Z[i].u.size(), MPI_INT, source, 0, MPI_COMM_WORLD, &status);
        }
        merge_buffers(0, 1);
    }
    WriteH5File("h5file.h5", 0);
}else{
    /* SLAVE NODES */
    for(int i=0;i<env_count;i++){
        MPI_Send(buffers[0].X[i].exp_val.data(), buffers[0].X[i].exp_val.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(buffers[0].Y[i].exp_val.data(), buffers[0].Y[i].exp_val.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(buffers[0].Z[i].exp_val.data(), buffers[0].Z[i].exp_val.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(buffers[0].X[i].u.data(), buffers[0].X[i].u.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(buffers[0].Y[i].u.data(), buffers[0].Y[i].u.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);
        MPI_Send(buffers[0].Z[i].u.data(), buffers[0].Z[i].u.size(), MPI_INT, 0, 0, MPI_COMM_WORLD);
    }
}
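For what it's worth, the effect I am after is just an element-wise integer sum of every rank's matrices onto rank 0. A minimal standalone sketch of that intent using a single MPI_Reduce collective is below; N is only a stand-in for HISTOGRAM_BOXES, and the loop over environments and the X/Y/Z and exp_val/u fields is omitted, so this is not my actual code:

#include <mpi.h>
#include <Eigen/Dense>

// Illustrative size; stands in for HISTOGRAM_BOXES.
constexpr int N = 64;
typedef Eigen::Matrix<int, N, N, Eigen::RowMajor> HistoMtx;

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    HistoMtx local = HistoMtx::Constant(rank);  // each rank's partial result
    HistoMtx total;                             // meaningful only on rank 0

    // Element-wise integer sum of every rank's matrix, delivered to rank 0.
    // Unlike the receive loop above, rank 0's own contribution is folded in
    // automatically by the collective.
    MPI_Reduce(local.data(), total.data(), static_cast<int>(local.size()),
               MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        // total now holds the global sum; this is where the write-out would go.
    }

    MPI_Finalize();
    return 0;
}

A collective like this also removes the six matched MPI_Send/MPI_Recv pairs per environment, which is typically where count or ordering mismatches cause hangs.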
The pbs_mom process then terminates. When I run the program in an interactive session, I find the following in the log:
[compute-35-3.local:01139] [[33012,0],2] ORTED_CMD_PROCESSOR: STUCK IN INFINITE LOOP - ABORTING
[compute-35-3:01139] *** Process received signal ***
I don't understand what this means or what triggers it. It appears to be internal to OpenMPI.
Answer 0 (score: 0)