I am a new user of MVAPICH2 and ran into trouble when I started using it.
First, I believe I installed it successfully with:
./configure --disable-fortran --enable-cuda
make -j 4
make install
There were no errors.
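As a sanity check (my own addition, not something from the build log), I can confirm which MVAPICH2 binaries end up on the PATH; if I am not mistaken, mpiname ships with MVAPICH2 and prints the version together with the configure flags it was built with:
which mpicc mpiexec mpirun_rsh   # confirm the freshly installed wrappers/launchers are the ones being found
mpiname -a                       # print MVAPICH2 version and build configuration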
But when I try to run the cpi example in the examples directory, I run into the following situation:
I can connect to the nodes gpu-cluster-1 and gpu-cluster-4 over ssh without a password.
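For instance, a plain ssh between the two nodes returns right away without asking for a password (an illustrative check, not copied verbatim from my terminal):
run@gpu-cluster-1:~$ ssh gpu-cluster-4 hostname
gpu-cluster-4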
Running the cpi example with mpirun_rsh on gpu-cluster-1 and on gpu-cluster-4 separately works fine, like this:
run@gpu-cluster-1:~/mvapich2-2.1rc1/examples$ mpirun_rsh -ssh -np 2 gpu-cluster-1 gpu-cluster-1 ./cpi
Process 0 of 2 is on gpu-cluster-1
Process 1 of 2 is on gpu-cluster-1
pi is approximately 3.1415926544231318, Error is 0.0000000008333387
wall clock time = 0.000089
run@gpu-cluster-4:~/mvapich2-2.1rc1/examples$ mpirun_rsh -ssh -np 2 gpu-cluster-4 gpu-cluster-4 ./cpi
Process 0 of 2 is on gpu-cluster-4
Process 1 of 2 is on gpu-cluster-4
pi is approximately 3.1415926544231318, Error is 0.0000000008333387
wall clock time = 0.000134
Running the cpi example across gpu-cluster-1 and gpu-cluster-4 with mpiexec also works fine, like this:
run@gpu-cluster-1:~/mvapich2-2.1rc1/examples$ mpiexec -np 2 -f hostfile ./cpi
Process 0 of 2 is on gpu-cluster-1
Process 1 of 2 is on gpu-cluster-4
pi is approximately 3.1415926544231318, Error is 0.0000000008333387
wall clock time = 0.000352
The content of the hostfile is "gpu-cluster-1\ngpu-cluster-4".
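Written out line by line, the hostfile is simply:
gpu-cluster-1
gpu-cluster-4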
However, when I run the cpi example across gpu-cluster-1 and gpu-cluster-4 with mpirun_rsh, the problem shows up:
run@gpu-cluster-1:~/mvapich2-2.1rc1/examples$ mpirun_rsh -ssh -np 2 -hostfile hostfile ./cpi
Process 1 of 2 is on gpu-cluster-4
----------------- It hangs here and does not go on ------------------------
After a long time, I pressed Ctrl+C, and it showed:
^C[gpu-cluster-1:mpirun_rsh][signal_processor] Caught signal 2, killing job
run@gpu-cluster-1:~/mvapich2-2.1rc1/examples$ [gpu-cluster-4:mpispawn_1][read_size] Unexpected End-Of-File on file descriptor 6. MPI process died?
[gpu-cluster-4:mpispawn_1][read_size] Unexpected End-Of-File on file descriptor 6. MPI process died?
[gpu-cluster-4:mpispawn_1][handle_mt_peer] Error while reading PMI socket. MPI process died?
[gpu-cluster-4:mpispawn_1][report_error] connect() failed: Connection refused (111)
I have been puzzled by this for quite a while. Could you help me figure out what is going wrong?
Here is the code of the cpi example:
#include "mpi.h"
#include <stdio.h>
#include <math.h>
double f(double);
double f(double a)
{
return (4.0 / (1.0 + a*a));
}
int main(int argc,char *argv[])
{
int n, myid, numprocs, i;
double PI25DT = 3.141592653589793238462643;
double mypi, pi, h, sum, x;
double startwtime = 0.0, endwtime;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stdout,"Process %d of %d is on %s\n",
myid, numprocs, processor_name);
fflush(stdout);
n = 10000; /* default # of rectangles */
if (myid == 0)
startwtime = MPI_Wtime();
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
h = 1.0 / (double) n;
sum = 0.0;
/* A slightly better approach starts from large i and works back */
for (i = myid + 1; i <= n; i += numprocs)
{
x = h * ((double)i - 0.5);
sum += f(x);
}
mypi = h * sum;
MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (myid == 0) {
endwtime = MPI_Wtime();
printf("pi is approximately %.16f, Error is %.16f\n",
pi, fabs(pi - PI25DT));
printf("wall clock time = %f\n", endwtime-startwtime);
fflush(stdout);
}
MPI_Finalize();
return 0;
}
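For reference, a minimal sketch of how I build and launch the example with the MVAPICH2 wrappers (the source file name cpi.c is my assumption; the examples directory normally builds it via make):
mpicc -o cpi cpi.c                               # rebuild the example with the MVAPICH2 compiler wrapper
mpiexec -np 2 -f hostfile ./cpi                  # the two-node launch that works for me
mpirun_rsh -ssh -np 2 -hostfile hostfile ./cpi   # the two-node launch that hangs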
#include "mpi.h"
#include <stdio.h>
#include <math.h>
double f(double);
double f(double a)
{
return (4.0 / (1.0 + a*a));
}
int main(int argc,char *argv[])
{
int n, myid, numprocs, i;
double PI25DT = 3.141592653589793238462643;
double mypi, pi, h, sum, x;
double startwtime = 0.0, endwtime;
int namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stdout,"Process %d of %d is on %s\n",
myid, numprocs, processor_name);
fflush(stdout);
n = 10000; /* default # of rectangles */
if (myid == 0)
startwtime = MPI_Wtime();
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);
h = 1.0 / (double) n;
sum = 0.0;
/* A slightly better approach starts from large i and works back */
for (i = myid + 1; i <= n; i += numprocs)
{
x = h * ((double)i - 0.5);
sum += f(x);
}
mypi = h * sum;
MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if (myid == 0) {
endwtime = MPI_Wtime();
printf("pi is approximately %.16f, Error is %.16f\n",
pi, fabs(pi - PI25DT));
printf("wall clock time = %f\n", endwtime-startwtime);
fflush(stdout);
}
MPI_Finalize();
return 0;
}