Question

我有一个程序，其核心是处理两个已排序的区间列表：一个中等大小的数据库（database）列表和一个很大的查询（queries）列表。每个数据库区间都要与所有和它重叠的查询匹配，之后查询列表要按照读入时的顺序写出。

这类似于扫描线（sweep line）一类的算法（如果有更准确的说法，欢迎指正）。

为了让程序能够处理非常大的输入，我希望（1）尽可能“局部”地工作，并且（2）尽快写出数据（即某个查询一旦不再需要，就应立即写出）。

整个任务实现起来有点笨拙，下面给出了一个最小可运行示例（MWE）。实际上，数据库并不是那么大，可以整个载入内存并放进区间树中。但是，如何处理查询的问题仍然存在。

我现在的问题是：是否有一种使用 Java 8 流的优雅解决方案，让我可以从并行处理中获益（处理一个带有多个查询的数据库区间开销有点大）？

我意识到，一个难点是要把每条查询记录归入多条数据库记录的分组中。另一个难点是：一旦某些查询已被完全处理，并且之后的任何数据都不会再影响下一个待写出的结果，就要对结果进行局部合并。

谢谢！

package mwe;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

/**
 * Minimal working example of a sweep over two lists of intervals that are both sorted by
 * {@code begin}: a list of database intervals and a list of query intervals.
 *
 * <p>Every database interval is reported together with all queries overlapping it as soon as
 * no later query can overlap it any more. Every query is written out, in input order, as soon as
 * all database intervals it overlaps have been reported and all queries before it were written.
 *
 * <p>All bookkeeping lives in static fields, so the class is neither reentrant nor thread-safe.
 * The maps are keyed by {@link Interval}, which compares by value; database intervals are
 * therefore expected to be distinct under {@code equals}.
 */
class MWE {

    /** A named half-open interval {@code [begin, end)}. */
    public static class Interval {
        String name;
        int begin;
        int end;

        Interval(String name, int begin, int end) {
            this.name = name;
            this.begin = begin;
            this.end = end;
        }

        /**
         * Tests two half-open intervals for overlap; intervals that merely touch do not overlap.
         *
         * @param that the other interval
         * @return {@code true} if both intervals share at least one position
         */
        boolean overlaps(Interval that) {
            return (that.begin < this.end) && (this.begin < that.end);
        }

        @Override
        public String toString() {
            return "Interval [name=" + name + ", begin=" + begin + ", end=" + end + "]";
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + begin;
            result = prime * result + end;
            result = prime * result + ((name == null) ? 0 : name.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            Interval other = (Interval) obj;
            if (begin != other.begin || end != other.end) {
                return false;
            }
            return (name == null) ? other.name == null : name.equals(other.name);
        }

    }

    /**
     * A query interval together with the number of still-active database intervals it overlaps.
     * The query may be written out once the counter is back at zero.
     */
    static class IntervalCounter {
        int counter;
        Interval itv;

        IntervalCounter(Interval itv) {
            this.counter = 0;
            this.itv = itv;
        }

        @Override
        public String toString() {
            return "IntervalCounter [counter=" + counter + ", itv=" + itv + "]";
        }

    }

    // DB intervals to come, sorted by begin position
    static List<Interval> inactiveIntervals = new ArrayList<>();
    // Currently active DB intervals, sorted by begin position
    static List<Interval> activeIntervals = new ArrayList<>();
    // Mapping from database to query interval
    static HashMap<Interval, ArrayList<Interval>> dbToQueries = new HashMap<>();
    // Mapping from a pending query interval to its entry in the list of outgoing intervals
    static HashMap<Interval, IntervalCounter> itvToCounter = new HashMap<>();
    // List of outgoing qry intervals, in input order
    static ArrayList<IntervalCounter> outgoingIntervals = new ArrayList<>();

    /**
     * Runs the sweep and prints the result to {@code System.err}.
     *
     * @param db  database intervals, sorted by {@code begin}
     * @param qry query intervals, sorted by {@code begin}
     */
    static void process(List<Interval> db, List<Interval> qry) {
        inactiveIntervals.addAll(db); // put all into queue

        // Process each query interval
        for (Interval q : qry) {
            assignToIntervals(q);
            processDone(q);
        }

        // A null query means "end of input": activate and flush whatever is left.
        assignToIntervals(null);
        processDone(null);
    }

    /**
     * Reports all active database intervals that no later query can overlap, then writes out
     * every leading query whose overlapping database intervals have all been reported.
     *
     * @param q the query that was just assigned, or {@code null} to flush everything
     */
    private static void processDone(Interval q) {
        // Count the leading database intervals that end at or before the begin of q. Later
        // queries begin no earlier than q, so these intervals cannot receive further queries.
        int popCount = 0;
        for (Interval db : activeIntervals) {
            if (q != null && q.begin < db.end) {
                break; // cannot guarantee done for next
            }
            System.err.println("Processing in DB " + db.name);
            for (Interval itv : dbToQueries.get(db)) {
                System.err.println("  " + itv.name);
            }
            popCount++;
        }

        // Release the finished database intervals and reduce the counters of their queries.
        for (Interval db : activeIntervals.subList(0, popCount)) {
            System.err.println("popping " + db.name);
            for (IntervalCounter pending : outgoingIntervals) {
                if (pending.itv.overlaps(db)) {
                    pending.counter -= 1;
                }
            }
            dbToQueries.remove(db);
        }
        // Drop the whole prefix at once instead of shifting the list once per element.
        activeIntervals.subList(0, popCount).clear();

        // Write out the leading queries that are done; stop at the first one that is not so the
        // output keeps the input order.
        int written = 0;
        while (written < outgoingIntervals.size() && outgoingIntervals.get(written).counter == 0) {
            IntervalCounter done = outgoingIntervals.get(written);
            System.err.println("Writing out query " + done.itv.name);
            // Only drop the mapping if it still points at this very counter.
            itvToCounter.remove(done.itv, done);
            written++;
        }
        outgoingIntervals.subList(0, written).clear();
    }

    /**
     * Activates every database interval the query could overlap, registers the query for output
     * and assigns it to all active database intervals it overlaps.
     *
     * @param q the next query, or {@code null} to activate all remaining database intervals
     */
    private static void assignToIntervals(Interval q) {
        // Activate new DB intervals: everything that begins before q ends could overlap q.
        int activated = 0;
        for (Interval db : inactiveIntervals) {
            if (q != null && q.end <= db.begin) {
                break; // cannot pull in more
            }
            activeIntervals.add(db);
            dbToQueries.put(db, new ArrayList<>());
            activated++;
        }
        inactiveIntervals.subList(0, activated).clear();

        if (q == null) {
            return;
        }

        // Register exactly one counter per query, regardless of how many database intervals the
        // query activated. Creating it inside the activation loop would leave a query without a
        // counter when it activates nothing, and with several counters when it activates many.
        IntervalCounter counter = new IntervalCounter(q);
        outgoingIntervals.add(counter);
        itvToCounter.put(q, counter);

        // Assign to active DB intervals
        for (Interval db : activeIntervals) {
            if (q.overlaps(db)) {
                dbToQueries.get(db).add(q);
                counter.counter += 1;
            }
        }
    }

    public static void main(String[] args) throws java.lang.Exception {
        ArrayList<Interval> db = new ArrayList<>();
        db.add(new Interval("db1", 1, 100));
        db.add(new Interval("db2", 95, 190));
        db.add(new Interval("db3", 200, 300));

        ArrayList<Interval> qry = new ArrayList<>();
        qry.add(new Interval("q1", 1, 20));
        qry.add(new Interval("q2", 99, 100));
        qry.add(new Interval("q3", 250, 251));

        // Guarantee: db and qry will always be sorted by begin

        process(db, qry);
    }
}

运行上述程序时的输出如下

Processing in DB db1
  q1
  q2
Processing in DB db2
  q2
popping db1
popping db2
Writing out query q1
Writing out query q2
Processing in DB db3
  q3
popping db3
Writing out query q3

Answer 1

此问题类似于“How to perform an outer join on two or more Streams”一问中回答过的流外连接问题。该连接实现使用两个阻塞队列：每个队列由单独的线程填充，并在数据耗尽时向队列中放入一个结束标记。基于 AbstractSpliterator 的 Spliterator 实现会按需消费每个队列中的值。

您的情况需要一个不同的 tryAdvance 实现。下面是建议的流后端实现；阻塞队列的填充部分请参阅上面提到的问题。由于 DB 条目和查询相互重叠，对该流进行并行化比较复杂。即便如此，所给实现仍支持有限的并行化（参见 AbstractSpliterator 中 trySplit 的实现）。

我测试过下面所用的思路，但这段代码本身未经测试。

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Spliterator;
import java.util.Spliterators.AbstractSpliterator;
import java.util.concurrent.BlockingQueue;
import java.util.function.Consumer;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.commons.lang3.tuple.Pair;

/**
 * Exposes the overlap join of a queue of DB entries and a queue of queries as a {@link Stream}
 * of (entry, query) pairs.
 *
 * <p>Both queues are expected to deliver their elements sorted by {@code begin()} and to finish
 * with the respective end-of-stream marker object. Intervals are treated as half-open
 * {@code [begin, end)}.
 */
public class QueryProcessor {
    /**
     * Spliterator that lazily produces one (entry, query) pair per {@code tryAdvance} call.
     * Instances keep mutable sweep state and must be driven by one thread at a time.
     */
    private static class SpliteratorImpl extends AbstractSpliterator<Pair<DbEntry, Query>> {
        private final Query queryEos;
        private final DbEntry dbEntryEos;
        private final BlockingQueue<DbEntry> dbQueue;
        private final BlockingQueue<Query> queryQueue;
        // Queries that were admitted and may still overlap the current or a later entry.
        private final Deque<Query> activeQueries = new ArrayDeque<>();
        // Pairs already computed for the current entry but not yet handed to the consumer.
        private final Deque<Pair<DbEntry, Query>> working = new ArrayDeque<>();
        private boolean queriesExhausted = false;
        // A query already taken from the queue that begins at or after the end of the entry
        // seen so far; it is kept for the following entries. Null when nothing is pending.
        private Query lookahead = null;

        public SpliteratorImpl(long est, int additionalCharacteristics, BlockingQueue<DbEntry> dbQueue, BlockingQueue<Query> queryQueue, DbEntry dbEntryEos, Query queryEos) {
            super(est, additionalCharacteristics);
            this.dbQueue = dbQueue;
            this.queryQueue = queryQueue;
            this.queryEos = queryEos;
            this.dbEntryEos = dbEntryEos;
        }

        /**
         * Hands the next (entry, query) pair to {@code action}.
         *
         * <p>Pairs buffered for the current entry are drained first, one per call. When the
         * buffer is empty the next DB entry is taken and all its overlapping queries are
         * buffered; entries without any overlapping query are skipped.
         *
         * @return {@code true} if a pair was delivered; {@code false} once the DB entries are
         *         exhausted, once no query is left that could overlap a later entry, or if the
         *         thread was interrupted while waiting on a queue
         */
        @Override
        public boolean tryAdvance(Consumer<? super Pair<DbEntry, Query>> action) {
            try {
                for (;;) {
                    // Deliver a pair buffered by an earlier iteration, if any.
                    if (!working.isEmpty()) {
                        Pair<DbEntry, Query> p = working.pop();
                        action.accept(p);
                        return true;
                    }

                    // Take the next entry from the DB entry queue.
                    DbEntry entry = dbQueue.take();
                    if (entry == dbEntryEos) {
                        // Encountered end-of-stream in DB entries -- we're done.
                        return false;
                    }

                    // Admit every query that begins before this entry ends. Queries arrive
                    // sorted by begin, so the first query beginning at or after the entry's
                    // end closes the set of candidates; it is parked in 'lookahead' for the
                    // following entries instead of being lost.
                    while (!queriesExhausted) {
                        if (lookahead == null) {
                            Query next = queryQueue.take();
                            if (next == queryEos) {
                                queriesExhausted = true;
                                break;
                            }
                            lookahead = next;
                        }
                        if (lookahead.begin() >= entry.end()) {
                            break;
                        }
                        // A query ending at or before the begin of this entry cannot overlap
                        // it, nor any later entry (entries are sorted by begin): discard it.
                        if (lookahead.end() > entry.begin()) {
                            activeQueries.add(lookahead);
                        }
                        lookahead = null;
                    }

                    // One pass over the active queries, rotating the deque so that its order
                    // is preserved: retire queries that end at or before the begin of this
                    // entry and pair the overlapping ones with the entry.
                    for (int remaining = activeQueries.size(); remaining > 0; remaining--) {
                        Query q = activeQueries.pollFirst();
                        if (q.end() <= entry.begin()) {
                            continue;
                        }
                        activeQueries.addLast(q);
                        if (isOverlapping(entry, q)) {
                            working.add(Pair.of(entry, q));
                        }
                    }

                    // With no active query and no query left to read, no later entry can
                    // produce a pair any more.
                    if (activeQueries.isEmpty() && queriesExhausted) {
                        return false;
                    }
                }
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers can observe the interruption, then end
                // the stream.
                Thread.currentThread().interrupt();
                return false;
            }
        }

        /**
         * Tests a DB entry and a query for overlap as half-open intervals, matching the
         * keep/discard comparisons used in {@code tryAdvance}.
         */
        private static boolean isOverlapping(DbEntry entry, Query q) {
            return q.begin() < entry.end() && entry.begin() < q.end();
        }
    }

    private QueryProcessor() {
    }

    /**
     * Creates a sequential stream of overlapping (entry, query) pairs.
     *
     * @param est                       estimated number of pairs, passed to the spliterator
     * @param additionalCharacteristics spliterator characteristics, passed to the spliterator
     * @param dbQueue                   source of DB entries, terminated by {@code dbEntryEos}
     * @param queryQueue                source of queries, terminated by {@code queryEos}
     * @param dbEntryEos                marker object signalling the end of the DB entries
     * @param queryEos                  marker object signalling the end of the queries
     * @return the stream of pairs
     */
    public static Stream<Pair<DbEntry, Query>> stream(long est, int additionalCharacteristics,
            BlockingQueue<DbEntry> dbQueue, BlockingQueue<Query> queryQueue, DbEntry dbEntryEos, Query queryEos) {
        Spliterator<Pair<DbEntry, Query>> spliterator =
            new SpliteratorImpl(est, additionalCharacteristics, dbQueue, queryQueue, dbEntryEos, queryEos);

        return StreamSupport.stream(spliterator, false);
    }
}

Java 8 流中的扫描线算法

1 个答案: