This is my first question on StackOverflow :-) Apologies if I'm posting it the wrong way...
Here is my problem: I have to compare a recursive Fibonacci algorithm across different parallel programming models: Cilk, OpenMP... and Open MPI.
Cilk and OpenMP are trivial, but Open MPI is a bit more complicated for me...
I found an implementation of recursive Fibonacci using MPI_Comm_spawn, and it works correctly, but MPI_Comm_spawn only creates and executes the new processes on the master node. As a result, the cluster goes unused.
So... my question is: is there a way to execute the spawned processes across the whole cluster? Otherwise, is there another way to implement recursive Fibonacci with Open MPI?
Thanks for your help! :-)
Here is the code, which currently works only on the master node:
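For context, Open MPI lets you pass placement hints to MPI_Comm_spawn through the MPI_Info argument; its man page documents keys such as "host", "add-host" and "hostfile". A minimal sketch of that idea (the hostname "node2" is a placeholder for one of your cluster nodes, not something from the code below):

    /* Sketch: asking the runtime to place a spawned child on a given node.
     * The "host" info key is documented in Open MPI's MPI_Comm_spawn man page;
     * "node2" is a placeholder hostname. */
    MPI_Info spawn_info;
    MPI_Info_create (&spawn_info);
    MPI_Info_set (spawn_info, "host", "node2");
    MPI_Comm_spawn ("slave_fib", argv, 1, spawn_info, 0,
                    MPI_COMM_SELF, &children_comm, errcodes);
    MPI_Info_free (&spawn_info);

Whether the runtime honors the hint also depends on the hosts/slots it knows about at launch time.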
[MASTER]
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char **argv)
{
    long n, fibn;
    int world_size, flag;
    int universe_size = 10;
    int myrank;
    char command[] = "slave_fib";
    MPI_Comm children_comm;
    int errcodes[1];

    MPI_Init (&argc, &argv);
    MPI_Comm_size (MPI_COMM_WORLD, &world_size);
    MPI_Comm_rank (MPI_COMM_WORLD, &myrank);

    MPI_Info local_info;
    MPI_Info_create (&local_info);

    if (world_size != 1)
        perror ("Top heavy with management");
    MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_UNIVERSE_SIZE, &universe_size, &flag);
    if (universe_size == 1)
        perror ("No room to start workers");

    /* Prepare argv for spawning the recursive process */
    argv += 1;
    n = atol (argv[0]);
    if (n < 2) {
        printf ("fib(%ld)=%ld\n", n, n);
        MPI_Finalize ();
        exit (0);
    } else {
        sprintf (argv[0], "%ld", n);
        MPI_Comm_spawn (command, argv, 1, local_info, myrank, MPI_COMM_SELF,
                        &children_comm, errcodes);
    }
    MPI_Recv (&fibn, 1, MPI_LONG, MPI_ANY_SOURCE, 1, children_comm,
              MPI_STATUS_IGNORE);
    printf ("fib(%ld)=%ld\n", n, fibn);
    fflush (stdout);
    MPI_Finalize ();
}
##### SPAWNED BINARY #####
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main (int argc, char **argv)
{
    long n, fibn, x, y;
    int myrank, size;
    char command[] = "slave_fib";
    MPI_Comm children_comm[2];
    MPI_Comm parent;
    MPI_Info local_info;
    int world_size, flag;
    int universe_size = 10;
    int errcodes[1];

    MPI_Init (&argc, &argv);
    MPI_Comm_get_parent (&parent);
    MPI_Comm_rank (MPI_COMM_WORLD, &myrank);
    MPI_Info_create (&local_info);
    MPI_Comm_size (MPI_COMM_WORLD, &world_size);

    if (parent == MPI_COMM_NULL)
        perror ("No parent!");
    if (parent != MPI_COMM_NULL)
        MPI_Comm_remote_size (parent, &size);
    if (size != 1)
        perror ("Something's wrong with the parent");
    MPI_Comm_get_attr (MPI_COMM_WORLD, MPI_UNIVERSE_SIZE, &universe_size, &flag);

    argv += 1;
    n = atol (argv[0]);
    if (n < 2) {
        MPI_Send (&n, 1, MPI_LONG, 0, 1, parent);
    } else {
        sprintf (argv[0], "%ld", (n - 1));
        MPI_Comm_spawn (command, argv, 1, local_info, myrank,
                        MPI_COMM_SELF, &children_comm[0], errcodes);
        sprintf (argv[0], "%ld", (n - 2));
        MPI_Comm_spawn (command, argv, 1, local_info, myrank,
                        MPI_COMM_SELF, &children_comm[1], errcodes);
        /* Receive both partial results; without this first receive,
           x is used uninitialized below */
        MPI_Recv (&x, 1, MPI_LONG, MPI_ANY_SOURCE, 1,
                  children_comm[0], MPI_STATUS_IGNORE);
        MPI_Recv (&y, 1, MPI_LONG, MPI_ANY_SOURCE, 1,
                  children_comm[1], MPI_STATUS_IGNORE);
        fibn = x + y; /* computation */
        MPI_Send (&fibn, 1, MPI_LONG, 0, 1, parent);
    }
    MPI_Finalize ();
}
How to execute it: mpirun -np 1 binary_name fib_num
The only way to run it is with -np 1; if you set np > 1, the execution returns an error (from MPI_Comm_spawn).
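One detail that matters for spawning across a cluster: MPI_Comm_spawn can only place children on hosts/slots the runtime already knows about. A hedged sketch of a launch setup, assuming a hostfile named `hosts` and placeholder node names (`master`, `node1`, `node2`):

    # Hypothetical hostfile; node names and slot counts are placeholders
    # for your own cluster.
    $ cat hosts
    master slots=1
    node1 slots=4
    node2 slots=4

    # Launch a single master process; the remaining slots stay available
    # for the processes created by MPI_Comm_spawn.
    $ mpirun -np 1 --hostfile hosts ./master_fib 10

With only one slot advertised (plain `mpirun -np 1` on a single host), the runtime has nowhere else to put the children, which is consistent with everything landing on the master node.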
Answer (score: 0)
After reinstalling Ubuntu 16.04 and libopenmpi-dev 1.10.2 on a 4-node cluster (without Torque), the Fibonacci computation seems to work, and the spawned processes are spread across all the nodes.
But when I try to compute a Fibonacci number greater than 10, I get some errors:
1) Sometimes the execution waits forever for the spawned processes to finish.
2) Sometimes I get this error:
Child job 67 terminated normally, but 1 process returned a non-zero
exit code..
In addition, on every run I get many messages like this:
[[30037,42],0] dpm_base_disconnect_init: error -12 in isend to process 0
These messages appear both when the computation fails and when it ends successfully. Could I be using Comm_spawn and send/recv the wrong way?
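The dpm_base_disconnect_init messages are emitted while Open MPI tears down inter-communicators. One thing worth trying (an assumption based on that, not a confirmed fix) is to disconnect every spawn communicator explicitly before MPI_Finalize, so the teardown is collective and ordered. A sketch, reusing the `children_comm` and `parent` variables from the code above:

    /* Sketch: explicit teardown of the inter-communicators created by
     * MPI_Comm_spawn, placed just before MPI_Finalize in the spawned binary. */
    MPI_Comm_disconnect (&children_comm[0]);
    MPI_Comm_disconnect (&children_comm[1]);
    if (parent != MPI_COMM_NULL)
        MPI_Comm_disconnect (&parent);
    MPI_Finalize ();

Note that MPI_Comm_disconnect is collective over the communicator, so the matching call has to happen on both sides of each parent/child pair.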