使用OpenMPI广播数据发生了死锁

时间:2017-11-16 13:34:21

标签: java multithreading mpi distributed-computing openmpi

我用Java编写了一个程序,它在单线程中调用OpenMPI进行通信。 Isend / recv 用于防止死锁。调用者调用send方法,然后将所有发送请求放入队列中。网络线程从队列获取请求发送。

class NetworkThread extends Thread {
    private final ConcurrentLinkedQueue<SendRequest> sendQueue = new ConcurrentLinkedQueue<>();
    private final List<Request> activeSends = new LinkedList<>();
    private final List<RecvRequest> recvList = new LinkedList<>();
    private volatile boolean shutdown;

    @Override
    public void run() {
        System.out.println("network thread started");
        try {
            loop();
        } catch (MPIException e) {
            e.printStackTrace();
        }
    }

    void loop() throws MPIException {
        while (!shutdown) {
            Status status = MPI.COMM_WORLD.iProbe(MPI.ANY_SOURCE, MPI.ANY_TAG);
            if (status != null) {
                int source = status.getSource();
                int tag = status.getTag();
                int sizeInBytes = status.getCount(MPI.BYTE);

                ByteBuffer buffer = MPI.newByteBuffer(sizeInBytes);
                MPI.COMM_WORLD.recv(buffer, sizeInBytes, MPI.BYTE, source, tag);
                byte[] data = new byte[sizeInBytes];
                buffer.get(data);
                RecvRequest recvRequest = new RecvRequest(data, source, tag);
                synchronized (recvList) {
                    recvList.add(recvRequest);
                }
            }

            SendRequest sendRequest;
            while ((sendRequest = sendQueue.poll()) != null) {
                byte[] data = sendRequest.getData();
                ByteBuffer buffer = MPI.newByteBuffer(data.length);
                buffer.put(data);
                Request request = MPI.COMM_WORLD.iSend(buffer, data.length, MPI.BYTE, sendRequest.getDest(), sendRequest.getTag());
                synchronized (activeSends) {
                    activeSends.add(request);
                }
            }
            //delete sent record
            synchronized (activeSends) {
                Iterator<Request> iterator = activeSends.iterator();
                while (iterator.hasNext()) {
                    Request request = iterator.next();
                    if (request.test())
                        iterator.remove();
                }
            }
        }
    }

    public void send(byte[] data, int dest, int tag) {
        SendRequest sendRequest = new SendRequest(data, dest, tag);
            sendQueue.add(sendRequest);
    }

    public byte[] read(int source, int tag) {
        byte[] data;
        while ((data = tryRead(source, tag)) == null) {
            try {
                Thread.sleep(10);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
        return data;
    }

    public byte[] tryRead(int source, int tag) {
        byte[] data = null;
        synchronized (recvList) {
            Iterator<RecvRequest> iterator = recvList.iterator();
            while (iterator.hasNext()) {
                RecvRequest recvRequest = iterator.next();
                if (recvRequest.getSource() == source && recvRequest.getTag() == tag) {
                    iterator.remove();
                    data = recvRequest.getData();
                    break;//just get one
                }
            }
        }
        return data;
    }

    public void shutdown() {
        shutdown = true;
        //waiting for all sent
        synchronized (activeSends) {
            while (activeSends.size() > 0)
                try {
                    Thread.sleep(1);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
        }
    }
}

class SendRequest {
    private byte[] data;
    private int dest;
    private int tag;

    SendRequest(byte[] data, int dest, int tag) {
        this.data = data;
        this.dest = dest;
        this.tag = tag;
    }

    public int getTag() {
        return tag;
    }

    public int getDest() {
        return dest;
    }

    public byte[] getData() {
        return data;
    }
}

class RecvRequest {
    private byte[] data;
    private int source;
    private int tag;

    RecvRequest(byte[] data, int source, int tag) {
        this.data = data;
        this.source = source;
        this.tag = tag;
    }

    public int getTag() {
        return tag;
    }

    public int getSource() {
        return source;
    }

    public byte[] getData() {
        return data;
    }
}

我还写了一个测试用例,它在随机时间间隔内为每个进程实现了五次广播数据(除了我自己和主人)。希望所有进程都完成发送/接收任务然后退出。

public class BroadcastTest {
    private static final int TAG_MPI = 123;
    static int rank;
    static String host;

    static {
        try {
            host = InetAddress.getLocalHost().getHostName();
        } catch (UnknownHostException e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) throws MPIException, InterruptedException, UnknownHostException {
        MPI.Init(args);
        int rank = MPI.COMM_WORLD.getRank();
        int size = MPI.COMM_WORLD.getSize();
        BroadcastTest.rank = rank;
        if (rank == 0) {
            System.out.println(String.format("total %d machines", size));
            System.out.println("master started");
        } else {
            NetworkThread networkThread = new NetworkThread();
            networkThread.start();

            Thread sendTh = new Thread(() -> {
                for (int i = 0; i < 5; i++) {
                    try {
                        Thread.sleep((long) (Math.random() * 1000)); //send data five times in random interval
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                    for (int machineId = 1/* skip master */; machineId < size; machineId++) {
                        if (machineId == rank) continue;//skip myself
                        networkThread.send(new byte[4096], machineId, TAG_MPI); //send 4K bytes data
                    }
                }
            });
            //receive data
            Thread recvTh = new Thread(() -> {
                for (int i = 0; i < 5; i++) {
                    for (int machineId = 1; machineId < size; machineId++) {
                        if (machineId == rank) continue;
                        byte[] bytes = networkThread.read(machineId, TAG_MPI);
                    }
                }
            });
            sendTh.start();
            recvTh.start();
            sendTh.join();
            recvTh.join();
            networkThread.shutdown();
            networkThread.join();
        }
        System.out.println(String.format("%s exit", host));
        MPI.Finalize();
    }
}

命令行:

/home/gongsf/openmpi-2.1.2/bin/mpirun --prefix /home/gongsf/openmpi-2.1.2 -bycore -nooversubscribe -machinefile /home/gongsf/JavaMPI/myhosts /home/gongsf/jdk1.8.0_144/bin/java -classpath /home/gongsf/JavaMPI/lib/*:/home/gongsf/JavaMPI/out/production/JavaMPI BroadcastTest

问题:此程序在较低的流程中正常工作,例如槽= 4。但是当增加要发送的插槽或数据大小时,程序偶尔会进入死锁状态。我试图改变OpenMPI版本(3.0,2.1.2,1.7.5),但似乎不起作用。

0 个答案:

没有答案