打乱列表顺序,以最大化相似元素之间的距离

时间:2016-06-20 22:56:42

标签: shuffle

在网址列表中

http://a.com/foo http://b.com/bar http://a.com/monkey http://c.com/prune http://a.com/bear http://b.com/walrus http://b.com/baz http://b.com/plugh

我希望最大化任意两个 a.com 网址之间的距离、任意两个 b.com 网址之间的距离,依此类推。算法的开销要小,但结果不必是最优的。(我用这个网址列表从 a.com、b.com、c.com 等网站下载文件,并且不希望以高于必要的频率访问其中任何一个网站。在上面的列表中,b.com 站点会被连续访问 3 次,这是应当避免的。)

理想情况下我希望有现成的 Java 库,但伪代码也可以接受。"Maximise sum of pairwise distances in array" 似乎是一个类似的问题,但它没有简单的答案——我只是想要一个"足够好"的方案。

1 个答案:

答案 0 :(得分:0)

由于没有人回答,我自己写了一个。它很粗糙,但确实有效。它读取一个 URL 列表,提取每个 URL 的主机名并统计各主机出现的次数,然后填充一个"鸽笼"数组:同一主机的各个 URL 所分配的目标索引之间的间隔,与该主机出现频率的倒数成比例。

package org.xmlcml.cmine.util;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

/**
 * Reorders a list of URLs so that URLs sharing the same host are spread
 * apart in the output instead of appearing next to each other.
 *
 * <p>Usage: call {@link #readURLs(List)} first, then
 * {@link #getShuffledUrls()}. Calling {@code getShuffledUrls()} before
 * {@code readURLs} fails because no input list has been stored yet.
 *
 * <p>Approach: the hosts are counted, then every URL is assigned a target
 * slot in a "pigeonhole" list. For a host occurring {@code count} times its
 * URLs get targets spaced roughly {@code totalSlots / count} apart. When a
 * target slot is already taken, the nearest free slot below it is used, and
 * failing that the nearest free slot above it.
 */
public class URLShuffler {

    public static final Logger LOG = Logger.getLogger(URLShuffler.class);
    static {
        LOG.setLevel(Level.DEBUG);
    }

    /**
     * Multiplier applied to the number of output slots, to provide extra
     * pigeonholes should they be needed (the original author notes this
     * did not seem necessary for medium-sized problems). With a value
     * greater than 1 the returned list contains unfilled {@code ""} entries.
     */
    private static final int TOL = 1;
    private List<String> urls;
    private Multiset<String> domains;
    private Map<String, Integer> currentIndexByDomain;
    private Map<String, Integer> countByDomain;
    private List<String> outputUrls;

    public URLShuffler() {

    }

    /**
     * Stores the input list and counts how many URLs belong to each host.
     *
     * @param urls the URLs to be shuffled; the list is kept by reference
     */
    public void readURLs(List<String> urls) {
        this.urls = urls;
        domains = HashMultiset.create();
        for (String url : urls) {
            String domain = getDomain(url);
            domains.add(domain);
        }
        LOG.debug(domains);
    }

    /**
     * Extracts the host part of a URL: the text after the first
     * {@code "//"} (if any) up to the next {@code "/"}.
     *
     * <p>A URL without a path, such as {@code http://a.com}, yields
     * everything after the {@code "//"}; previously this case threw a
     * {@link StringIndexOutOfBoundsException}.
     *
     * <p>This would be better using java.net.URL.
     *
     * @param url the URL to inspect
     * @return the host portion of {@code url}
     */
    private String getDomain(String url) {
        int idx = url.indexOf("//");
        if (idx != -1) {
            url = url.substring(idx + 2);
        }
        idx = url.indexOf("/");
        // No '/' after the host means the URL has no path: the remainder is the host.
        return (idx == -1) ? url : url.substring(0, idx);
    }

    /**
     * Builds the reordered list from the URLs given to
     * {@link #readURLs(List)}.
     *
     * @return a list of {@code urls.size() * TOL} entries holding every
     *         input URL exactly once
     */
    public List<String> getShuffledUrls() {
        currentIndexByDomain = new HashMap<String, Integer>();
        countByDomain = new HashMap<String, Integer>();

        // Pre-fill the output with "" markers; "" means "slot still free".
        outputUrls = new ArrayList<String>();
        for (int i = 0; i < urls.size() * TOL; i++) {
            outputUrls.add("");
        }

        // Project convenience method; per the original note it wraps a Guava sort.
        for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
            LOG.debug(entry);
            countByDomain.put(entry.getElement(), entry.getCount());
            // Each host counts down from its last index (count - 1) to 0.
            currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
        }
        for (String url : urls) {
            String domain = getDomain(url);
            Integer currentIndex = currentIndexByDomain.get(domain);
            Integer count = countByDomain.get(domain);
            // Multiply in long: size * index exceeds the int range for large
            // single-host lists, which would produce a negative slot. The
            // quotient is always below urls.size() * TOL, so the cast is safe.
            int slot = (int) (((long) urls.size() * currentIndex * TOL) / count);
            currentIndexByDomain.put(domain, currentIndex - 1);
            addUrl(url, slot);
        }
        return outputUrls;
    }

    /**
     * Places {@code url} at {@code slot}, or at the nearest free slot
     * below it, or failing that at the nearest free slot above it.
     */
    private void addUrl(String url, int slot) {
        boolean filled = fillLower(url, slot);
        if (!filled) {
            fillUpper(url, slot);
        }
    }

    /**
     * If slot is not free, runs upwards till the next free slot.
     *
     * @return {@code true} if the URL was stored
     */
    private boolean fillUpper(String url, int slot) {
        for (int i = slot; i < outputUrls.size(); i++) {
            if (fill(url, i)) {
                return true;
            }
        }
        return false;
    }

    /**
     * If slot is not free, runs downwards till the next free slot.
     *
     * @return {@code true} if the URL was stored
     */
    private boolean fillLower(String url, int slot) {
        for (int i = slot; i >= 0; i--) {
            if (fill(url, i)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Stores {@code url} at {@code slot} if that slot still holds the
     * {@code ""} marker.
     *
     * @return {@code true} if the slot was free and has now been filled
     */
    private boolean fill(String url, int slot) {
        if (outputUrls.get(slot).equals("")) {
            outputUrls.set(slot, url);
            return true;
        }
        return false;
    }
}

```