问题:
我正在使用MPI C ++绑定(MPICH 2)编写MPI程序。在下面列出的特定数量的调用函数之后,我在调用MPI::COMM::Create
方法后立即出现死锁(所有正在运行的进程都终止)。
LOCAL_COMM = LOCAL_COMM.Create(localGroup)
需要更加具体。
我的猜测是出现了某种溢出,但我是MPI编程的新手,无法找到问题的根源。提前感谢您的帮助。
代码:
Matrix Matrix::operator* (Matrix &M)
{
//Synchronize all threads
//COMM_WORLD.Barrier();
int hisCol = M.myCol;
int commSize = COMM_WORLD.Get_size();
//Create local Group and Communicator equal to COMM_WORLD
Intracomm LOCAL_COMM = COMM_WORLD.Dup();
Group localGroup(LOCAL_COMM.Get_group());
//Change locals if required
if(commSize > myRow) {
//Make array of ranks to include
vector<int> masRanks(myRow);
for(int i = 0; i < myRow; i++)
masRanks[i] = i;
//Create new localGroup and LOCAL_COMM
localGroup = localGroup.Incl(myRow,&masRanks[0]);
LOCAL_COMM = LOCAL_COMM.Create(localGroup);
if(LOCAL_COMM == COMM_NULL) {
localGroup.Free();
return Matrix(myRow,hisCol);
}
commSize = LOCAL_COMM.Get_size();
}
//Calculate number of Rows displacement and amount to Send per Thread
vector<int> masRows(commSize,0);
vector<int> amntToSend(commSize,0), amntToRecv(commSize,0);
int tmpRows = myRow;
for(int i = commSize; i > 0; i--) {
masRows[i-1] = ((double)tmpRows / i + 0.5);
tmpRows -= masRows[i-1];
amntToSend[i-1] = masRows[i-1] * myCol;
amntToRecv[i-1] = masRows[i-1] * hisCol;
}
//Form array specifying displacement to send to each thread
vector<int> elemDisp(1,0), elemRecvDisp(1,0);
if(myRank == 0)
for(int i = 1; i < commSize; i++) {
elemDisp.push_back(amntToSend[i-1] + elemDisp[i-1]);
elemRecvDisp.push_back(amntToRecv[i-1] + elemRecvDisp[i-1]);
}
//Prepare variables and send / receive left and right matrix
int maxRow = *max_element(masRows.begin(),masRows.end());
Matr tmpData(maxRow*myCol,0);
LOCAL_COMM.Scatterv(&data[0],&amntToSend[0],&elemDisp[0],DOUBLE,&tmpData[0],masRows[myRank]*myCol,DOUBLE,0);
LOCAL_COMM.Bcast(&M.data[0],M.myRow*hisCol,DOUBLE,0);
//Start evaluation of each element
Matr resData(masRows[myRank]*hisCol,0);
for(int row = 0; row < masRows[myRank]; row++)
for(int i = 0; i < hisCol; i++)
for(int j = 0; j < myCol; j++)
resData[row*hisCol + i] += tmpData[row*myCol + j] * M.data[j*hisCol + i];
//Gather all data at 0-thread
Matr outData(myRow*hisCol);
LOCAL_COMM.Gatherv(&resData[0],masRows[myRank]*hisCol,DOUBLE,&outData[0],&amntToRecv[0],&elemRecvDisp[0],DOUBLE,0);
//Return result Matrix-object
Matrix resMatr(myRow, hisCol);
resMatr.data = outData;
//Free group and communicator
localGroup.Free();
LOCAL_COMM.Free();
return resMatr;
}
答案 0 :(得分:0)
明显但无益的答案是你编写了一个程序死锁。可以说这比写一个无死锁的要容易得多。
您是否调查了可能有助于您找到问题的任何工具?工具如:
英特尔的Trace Analyzer(实际上可能基于吸血鬼)
或并行调试器
RogueWave的TotalView
Allinea的DDT
其中至少有一个应该在您的并行编程工具箱中。
答案 1 :(得分:0)
我发现问题出现在我创建新Group和Communicator的方式中。也许COMM_WORLD.Dup()从一开始就是错误的选择,因为在离开函数之后,我真的不知道编译器对它的值做了什么。
无论如何纠正的代码如下所列。
固定代码:
Matrix Matrix::operator* (Matrix &M)
{
int hisCol = M.myCol;
int commSize = COMM_WORLD.Get_size();
//Make array of ranks to include
vector<int> vRanks;
for(int i = 0; i < myRow && i < commSize; i++)
vRanks.push_back(i);
//Create new LocalGroup and LOCAL_COMM
Group LocalGroup = COMM_WORLD.Get_group().Incl(vRanks.size(),&vRanks[0]);
Intracomm LOCAL_COMM = COMM_WORLD.Create(LocalGroup);
COMM_WORLD.Barrier();
//Make unused processes quit
if(LOCAL_COMM == COMM_NULL) {
LocalGroup.Free();
return Matrix(myRow,hisCol);
}
commSize = LOCAL_COMM.Get_size();
//The rest is pretty much the same