Question

我正在尝试在两个独立启动的可执行文件之间创建共享的MPI COMM，例如

mpiexec -n 1 ./exe1 
mpiexec -n 1 ./exe2

我使用MPI_Open_port生成端口详细信息，并将其写入exe1中的文件，然后使用exe2读取。接下来是MPI_Comm_connect / MPI_Comm_accept，然后是发送/接收通信（下面的最小示例）。

我的问题是：可以用这种方式把端口信息写入文件吗？还是说，要让它在OpenMPI下工作，必须像this和this中那样使用MPI_Publish_name/MPI_Lookup_name？由于超级计算机通常共享一个文件系统，这种基于文件的方法似乎更简单，而且可以避免额外启动一个服务器。使用MPICH（3.2）时，将端口信息写入文件可以按预期工作，即成功创建共享的通信器并交换信息；但使用OpenMPI 2.0.1和4.0.1时，程序都会挂起在MPI_Comm_connect / MPI_Comm_accept这一行（在运行Ubuntu 12.04的本地工作站上）。

其他信息

如果我将MPMD模式与OpenMPI一起使用，

mpiexec -n 1 ./exe1 : -n 1 ./exe2

这可以正常工作，因此问题似乎出在如何让两个独立的作业共享ompi_global_scope上（与this问题中的情况类似）。我也尝试添加，

MPI_Info info;
MPI_Info_create(&info);
MPI_Info_set(info, "ompi_global_scope", "true");

并将该info传递给所有相关的MPI调用，但没有成功。我运行的不是服务器/客户端模型，因为这两个程序是同时运行的，所以由其中一个共享URL / PID并不理想；不过，即使采用建议的方法，我也无法使其工作。对于OpenMPI 2.0.1，

mpirun -n 1 --report-pid + ./OpenMPI_2.0.1 0
1234

mpirun -n 1 --ompi-server pid:1234 ./OpenMPI_2.0.1 1

输出为

ORTE_ERROR_LOG: Bad parameter in file base/rml_base_contact.c at line 161

This failure appears to be an internal failure;
here's some additional information (which may only be relevant to an
Open MPI developer):

  pmix server init failed
  --> Returned value Bad parameter (-5) instead of ORTE_SUCCESS

以及OpenMPI 4.0.1，

mpirun -n 1 --report-pid + ./OpenMPI_4.0.1 0
1234

mpirun -n 1 --ompi-server pid:1234 ./OpenMPI_4.0.1 1

输出为

ORTE_ERROR_LOG: Bad parameter in file base/rml_base_contact.c at line 50

...

A publish/lookup server was provided, but we were unable to connect
to it - please check the connection info and ensure the server
is alive:

使用4.0.1意味着该错误不应与OpenMPI中的this错误有关。

最小代码

    #include "mpi.h"
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <iostream>
    #include <fstream>

    using namespace std;

    int main( int argc, char *argv[] )
    {
        int num_errors = 0;
        int rank, size;
        char port1[MPI_MAX_PORT_NAME];
        char port2[MPI_MAX_PORT_NAME];
        MPI_Status status;
        MPI_Comm comm1, comm2;
        int data = 0;

        char *ptr;
        int runno = strtol(argv[1], &ptr, 10);
        for (int i = 0; i < argc; ++i)
            printf("inputs %d %d %s \n", i,runno, argv[i]);

        MPI_Init(&argc, &argv);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);

        if (runno == 0)
        {
            printf("0: opening ports.\n");fflush(stdout);
            MPI_Open_port(MPI_INFO_NULL, port1);
            printf("opened port1: <%s>\n", port1);

            //Write port file
            ofstream myfile;
            myfile.open("port");
            if( !myfile )
                    cout << "Opening file failed" << endl;
            myfile << port1 << endl;
            if( !myfile )
                cout << "Write failed" << endl;
            myfile.close();

            printf("Port %s written to file \n", port1); fflush(stdout);

            printf("Attempt to accept port1.\n");fflush(stdout);

            //Establish connection and send data
            MPI_Comm_accept(port1, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm1);

            printf("sending 5 \n");fflush(stdout);
            data = 5;
            MPI_Send(&data, 1, MPI_INT, 0, 0, comm1);
            MPI_Close_port(port1);
        }
        else if (runno == 1)
        {

            //Read port file
            size_t   chars_read = 0;  
            ifstream myfile;
            //Wait until file exists and is avaialble
            myfile.open("port");
            while(!myfile){
                myfile.open("port");
                cout << "Opening file failed" << myfile << endl;
                usleep(30000);
            }
            while( myfile && chars_read < 255 ) {
                myfile >> port1[ chars_read ];    
                if( myfile ) 
                     ++chars_read; 

                if( port1[ chars_read - 1 ] == '\n' ) 
                     break;
            }
            printf("Reading port %s from file \n", port1); fflush(stdout);
            remove( "port" );

            //Establish connection and recieve data
            MPI_Comm_connect(port1, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm1);
            MPI_Recv(&data, 1, MPI_INT, 0, 0, comm1, &status);
            printf("Received %d 1\n", data); fflush(stdout);

        }

        //Barrier on intercomm before disconnecting
        MPI_Barrier(comm1);
        MPI_Comm_disconnect(&comm1);
        MPI_Finalize();
        return 0;
    }

0和1仅指定在上面的示例中此代码是写入端口文件还是读取端口文件。然后运行，

mpiexec -n 1 ./a.out 0 
mpiexec -n 1 ./a.out 1

我可以用MPI_Open_port连接两个独立的可执行文件，并通过文本文件共享端口信息吗？

其他信息

最小代码

0 个答案: