我使用OpenMPI在Sun Grid Engine的集群上运行MPI应用程序。
有没有人经历过在应用程序运行时挂起的MPI通信?
例如:排名0的进程调用MPI_Send进行级别1处理,级别1进程调用级别0进程的MPI_Recv。排名正确,标签正确,但是通信不会发生因此应用永远不会终止。
N.B。这些应用程序可以在我的笔记本电脑和其他机器上运行,因此不会出现我的ID错误...此外,这些应用程序第一次在群集上运行,但是当我再次提交完全相同的作业时,MPI通信会挂起如上所述。
如果有人经历过类似的事情,那么任何帮助都会受到赞赏,谢谢。
编辑: 我已经实现了一个非常简单的应用程序示例:
#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <pthread.h>
#include <sys/time.h>
#define NUM_CELLS 5 //Total number of integers to sort
/* Initial process to create numbers and send to first cell */
void *start(void *data)
{
int i, num;
time_t t;
srand((unsigned) time(&t));
//Create array of random numbers and print
for(i = 0; i < NUM_CELLS; i++)
{
num = rand() % 100;
printf("0 SEND\n");
MPI_Send(&num, 1, MPI_INT, 1, 1, MPI_COMM_WORLD);
printf("0 SENT\n");
}
return NULL;
}
/* Process for individual sort cell in sort pump */
void *sort_cell(void *data)
{
int *pos = (int *)data;
int num=2, i;
for(i = 0; i < NUM_CELLS; i++)
{
//Receive num
printf("%d WAIT\n", *pos);
if(*pos == 1)
MPI_Recv(&num, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
else if(*pos == 2)
MPI_Recv(&num, 1, MPI_INT, 1, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
else if(*pos == 3)
MPI_Recv(&num, 1, MPI_INT, 2, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
else if(*pos == 4)
MPI_Recv(&num, 1, MPI_INT, 3, 4, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
else if(*pos == 5)
MPI_Recv(&num, 1, MPI_INT, 4, 5, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
printf("%d RECV\n", *pos);
//Keep larger number and send smaller number to next cell
printf("%d SEND\n", *pos);
if(*pos == 1)
MPI_Send(&num, 1, MPI_INT, 2, 2, MPI_COMM_WORLD);
else if(*pos == 2)
MPI_Send(&num, 1, MPI_INT, 3, 3, MPI_COMM_WORLD);
else if(*pos == 3)
MPI_Send(&num, 1, MPI_INT, 4, 4, MPI_COMM_WORLD);
else if(*pos == 4)
MPI_Send(&num, 1, MPI_INT, 1, 5, MPI_COMM_WORLD);
printf("%d SENT\n", *pos);
}
return NULL;
}
int main(int argc, char **argv)
{
int i;
double elapsedTime;
struct timeval t1, t2;
//Start timer
gettimeofday(&t1, NULL);
int my_rank, provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
//Stop timer
gettimeofday(&t2, NULL);
elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; //sec to ms
elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; //us to ms
printf("Rank %d - Setup time: %f milliseconds\n", my_rank, elapsedTime);
//Start timer
gettimeofday(&t1, NULL);
//Execute processes in parallel according to mapping
if(my_rank == 0)
{
int num_threads = 1;
pthread_t threads[num_threads];
pthread_create(&threads[0], NULL, start, NULL);
for(i = 0; i < num_threads; i++)
(void) pthread_join(threads[i], NULL);
}
if(my_rank == 1)
{
int pos = 1;
int num_threads = 2;
pthread_t threads[num_threads];
pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);
int pos1 = 5;
pthread_create(&threads[1], NULL, sort_cell, (void *) &pos1);
for(i = 0; i < num_threads; i++)
(void) pthread_join(threads[i], NULL);
}
if(my_rank == 2)
{
int pos = 2;
int num_threads = 1;
pthread_t threads[num_threads];
pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);
for(i = 0; i < num_threads; i++)
(void) pthread_join(threads[i], NULL);
}
if(my_rank == 3)
{
int pos = 3;
int num_threads = 1;
pthread_t threads[num_threads];
pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);
for(i = 0; i < num_threads; i++)
(void) pthread_join(threads[i], NULL);
}
if(my_rank == 4)
{
int pos = 4;
int num_threads = 1;
pthread_t threads[num_threads];
pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);
for(i = 0; i < num_threads; i++)
(void) pthread_join(threads[i], NULL);
}
MPI_Barrier(MPI_COMM_WORLD);
//Stop timer
gettimeofday(&t2, NULL);
elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; //sec to ms
elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; //us to ms
printf("Rank %d - Execution time: %f milliseconds\n", my_rank, elapsedTime);
MPI_Finalize();
return 0;
}
根据排名上sort_cells的顺序,它有效,我无法理解为什么......
申请如下: Sort_cell - &gt; Sort_cell - &gt; Sort_cell - &gt; Sort_cell - &gt; Sort_cell - &gt; Sort_cell 排名:0 1 2 3 4 1 (其中每个Sort_cell分别是在指定等级上运行的进程)
如果这些是在分组的等级上运行,即:00111,01233,00112等,它可以工作..但是一旦我在杂散等级上执行(如上例),即:00110,01221,01231等MPI通信挂起。有什么建议?