MPI on a Sun Grid Engine cluster

Time: 2016-08-11 14:21:19

Tags: mpi cluster-computing openmpi mpich sungridengine

I am running MPI applications on a Sun Grid Engine cluster using OpenMPI.

Has anyone experienced MPI communication hanging while an application is running?

For example: the rank 0 process calls MPI_Send to rank 1, and the rank 1 process calls MPI_Recv from rank 0. The ranks are correct and the tags are correct, but the communication never happens, so the application never terminates.
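To make the pattern concrete, here is a stripped-down sketch of the pairing I mean (not the real application; the tag and payload value are just placeholders):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank, value = 42;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if(rank == 0)
        MPI_Send(&value, 1, MPI_INT, 1, 1, MPI_COMM_WORLD);    //to rank 1, tag 1
    else if(rank == 1)
        MPI_Recv(&value, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);    //from rank 0, tag 1

    printf("Rank %d done\n", rank);
    MPI_Finalize();
    return 0;
}

It is this kind of matched Send/Recv pair that never completes on the cluster.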

N.B. The applications run fine on my laptop and on other machines, so it is not a case of my rank/tag IDs being wrong... Also, the applications run the first time on the cluster, but when I submit the exact same job again, the MPI communication hangs as described above.

If anyone has experienced anything similar, any help would be appreciated, thanks.

EDIT: I have put together a very simple example application:

#include <mpi.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <pthread.h>
#include <sys/time.h>
#include <time.h>

#define NUM_CELLS 5     //Total number of integers to sort

/* Initial process to create numbers and send to first cell */ 
void *start(void *data)
{
    int i, num;
    time_t t;
    srand((unsigned) time(&t));

    //Create array of random numbers and print
    for(i = 0; i < NUM_CELLS; i++) 
    {
        num = rand() % 100;
        printf("0 SEND\n");
        MPI_Send(&num, 1, MPI_INT, 1, 1, MPI_COMM_WORLD); 
        printf("0 SENT\n");
    }
    return NULL;
}

/* Process for individual sort cell in sort pump */
void *sort_cell(void *data)
{   
    int *pos = (int *)data;

    int num=2, i;
    for(i = 0; i < NUM_CELLS; i++)
    {
        //Receive num
        printf("%d WAIT\n", *pos);           
        if(*pos == 1) 
              MPI_Recv(&num, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        else if(*pos == 2)
              MPI_Recv(&num, 1, MPI_INT, 1, 2, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        else if(*pos == 3)
              MPI_Recv(&num, 1, MPI_INT, 2, 3, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        else if(*pos == 4)
              MPI_Recv(&num, 1, MPI_INT, 3, 4, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        else if(*pos == 5)
              MPI_Recv(&num, 1, MPI_INT, 4, 5, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        printf("%d RECV\n", *pos);

        //Keep larger number and send smaller number to next cell
        printf("%d SEND\n", *pos); 
        if(*pos == 1)
          MPI_Send(&num, 1, MPI_INT, 2, 2, MPI_COMM_WORLD);    
        else if(*pos == 2)
          MPI_Send(&num, 1, MPI_INT, 3, 3, MPI_COMM_WORLD);
        else if(*pos == 3) 
          MPI_Send(&num, 1, MPI_INT, 4, 4, MPI_COMM_WORLD);    
        else if(*pos == 4) 
          MPI_Send(&num, 1, MPI_INT, 1, 5, MPI_COMM_WORLD);
        printf("%d SENT\n", *pos);
  } 
  return NULL;
}

int main(int argc, char **argv)
{
    int i;
    double elapsedTime;
    struct timeval t1, t2;

    //Start timer
    gettimeofday(&t1, NULL);

    int my_rank, provided;
    //Request full multithreaded support; rank 1 runs two threads that both make MPI calls
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

    //Stop timer
    gettimeofday(&t2, NULL);

    elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;      //sec to ms
    elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;   //us to ms

    printf("Rank %d - Setup time: %f milliseconds\n", my_rank, elapsedTime);

    //Start timer
    gettimeofday(&t1, NULL);

    //Execute processes in parallel according to mapping
  if(my_rank == 0)
  {
    int num_threads = 1;

    pthread_t threads[num_threads];

    pthread_create(&threads[0], NULL, start, NULL);

    for(i = 0; i < num_threads; i++)
        (void) pthread_join(threads[i], NULL); 
  }
  if(my_rank == 1)
  {
    int pos = 1;
    int num_threads = 2;

    pthread_t threads[num_threads];

    pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);

    int pos1 = 5;
    pthread_create(&threads[1], NULL, sort_cell, (void *) &pos1);

    for(i = 0; i < num_threads; i++)
        (void) pthread_join(threads[i], NULL); 
  }

  if(my_rank == 2)
  {
    int pos = 2;
    int num_threads = 1;

    pthread_t threads[num_threads];

    pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);

    for(i = 0; i < num_threads; i++)
        (void) pthread_join(threads[i], NULL);
  }

  if(my_rank == 3)
  {
    int pos = 3;
    int num_threads = 1;

    pthread_t threads[num_threads];

    pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);

    for(i = 0; i < num_threads; i++)
        (void) pthread_join(threads[i], NULL);
  }

  if(my_rank == 4)
  {
    int pos = 4;
    int num_threads = 1;

    pthread_t threads[num_threads];

    pthread_create(&threads[0], NULL, sort_cell, (void *) &pos);

    for(i = 0; i < num_threads; i++)
        (void) pthread_join(threads[i], NULL);
  }

    MPI_Barrier(MPI_COMM_WORLD);

    //Stop timer
    gettimeofday(&t2, NULL);

    elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0;      //sec to ms
    elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0;   //us to ms

    printf("Rank %d - Execution time: %f milliseconds\n", my_rank, elapsedTime);

    MPI_Finalize();

    return 0;
}

Depending on how the sort_cells are placed on the ranks, it works, and I cannot understand why...

The application is laid out as follows: Sort_cell -> Sort_cell -> Sort_cell -> Sort_cell -> Sort_cell -> Sort_cell, on ranks 0, 1, 2, 3, 4, 1 respectively (where each Sort_cell is a process running on the specified rank).

If the cells run on grouped ranks, e.g. 00111, 01233, 00112, etc., it works. But as soon as I run them scattered across the ranks (as in the example above), e.g. 00110, 01221, 01231, etc., the MPI communication hangs as described. Any suggestions?
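One thing I have not ruled out (this is an assumption on my part, not something I have verified on the cluster): when two Sort_cells share a rank, as rank 1 does in the example above, two threads make MPI calls at the same time, so the library has to actually grant MPI_THREAD_MULTIPLE rather than silently downgrade the thread level. A small check like this, placed right after the existing MPI_Init_thread call, would make that visible:

    //Hypothetical diagnostic (not in the code above): verify the granted thread level,
    //since concurrent MPI calls from several threads are only safe at MPI_THREAD_MULTIPLE
    if(provided < MPI_THREAD_MULTIPLE)
    {
        fprintf(stderr, "MPI thread support too low: requested %d, got %d\n",
                MPI_THREAD_MULTIPLE, provided);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

I am not claiming this is the cause of the hang; it is just the first thing I would rule out before digging further into the Grid Engine/OpenMPI setup.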

0 Answers:

No answers yet.