在网址列表中
http://a.com/foo
http://b.com/bar
http://a.com/monkey
http://c.com/prune
http://a.com/bear
http://b.com/walrus
http://b.com/baz
http://b.com/plugh
我希望最大化任意一对 a.com 网址之间的距离、任意一对 b.com 网址之间的距离,依此类推。算法的开销要低,但结果不必是最优的。(我使用这个网址列表从 a.com、b.com、c.com 等网站下载文件,并且不希望以高于必要的频率访问任何一个网站。在上面的例子中,我们会连续 3 次访问 b.com 站点,这是应当避免的。)
理想情况下我想要一个 Java 库,但伪代码也可以接受。“Maximise sum of pairwise distances in array”似乎是一个类似的问题,但没有简单的答案——我只是想要一个“足够好”的方案。
答案 0 :(得分:0)
由于没有人回答,我自己写了一个。它很粗糙,但确实有效。它读取一个 URL 列表,提取主机名并统计每个主机出现的次数,然后填充一个“鸽巢”(pigeonhole)数组,每个 URL 的目标索引与其主机出现频率的倒数成比例。
package org.xmlcml.cmine.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
/**
 * Reorders a list of URLs so that URLs sharing the same host are spread out
 * across the list instead of appearing next to each other.
 *
 * <p>Usage: call {@link #readURLs(List)} to load the URLs and count how often
 * each host occurs, then {@link #getShuffledUrls()} to obtain the reordered list.
 *
 * <p>The algorithm is a cheap heuristic, not an optimal one: a "pigeonhole" list
 * with one slot per URL is allocated, and the URLs of a host that occurs
 * {@code count} times are aimed at slots {@code size * k / count} for
 * {@code k = count-1 .. 0}. If the target slot is taken, the nearest free slot
 * below it is used, otherwise the nearest free slot above it.
 */
public class URLShuffler {

    public static final Logger LOG = Logger.getLogger(URLShuffler.class);
    static {
        LOG.setLevel(Level.DEBUG);
    }

    // Multiplier for the number of pigeonholes, in case extra holes are ever
    // needed; 1 seems to be enough for medium-sized problems.
    private static final int TOL = 1;

    // Initialised to empty so getShuffledUrls() is safe before readURLs().
    private List<String> urls = new ArrayList<String>();
    private Multiset<String> domains = HashMultiset.create();
    private Map<String, Integer> currentIndexByDomain;
    private Map<String, Integer> countByDomain;
    private List<String> outputUrls;

    public URLShuffler() {
    }

    /**
     * Loads the URLs to be shuffled and counts the occurrences of each host.
     * Replaces any previously loaded URLs.
     *
     * @param urls the URLs to shuffle; a private copy is taken so later changes
     *             to the caller's list cannot get out of step with the host counts
     */
    public void readURLs(List<String> urls) {
        this.urls = new ArrayList<String>(urls);
        domains = HashMultiset.create();
        for (String url : this.urls) {
            domains.add(getDomain(url));
        }
        LOG.debug(domains);
    }

    /**
     * Extracts the host part of a URL by plain string slicing: everything after
     * the first "//" (if there is one) up to, but excluding, the next "/".
     * A URL without a path (for example "http://a.com") yields the whole
     * remainder instead of failing. Nothing else is stripped, so a port or
     * user-info section stays part of the returned value.
     *
     * this would be better using java.net.URL
     */
    private String getDomain(String url) {
        int schemeEnd = url.indexOf("//");
        String rest = (schemeEnd != -1) ? url.substring(schemeEnd + 2) : url;
        int pathStart = rest.indexOf("/");
        // indexOf returns -1 when there is no path; substring(0, -1) would throw.
        return (pathStart == -1) ? rest : rest.substring(0, pathStart);
    }

    /**
     * Distributes the loaded URLs over the pigeonhole list and returns them in
     * their new order.
     *
     * @return a list containing every loaded URL exactly once, reordered so that
     *         URLs of the same host are spaced apart; empty if nothing was loaded
     */
    public List<String> getShuffledUrls() {
        currentIndexByDomain = new HashMap<String, Integer>();
        countByDomain = new HashMap<String, Integer>();

        // A null entry marks a free pigeonhole.
        int slotCount = urls.size() * TOL;
        outputUrls = new ArrayList<String>(slotCount);
        for (int i = 0; i < slotCount; i++) {
            outputUrls.add(null);
        }

        // Convenience method wrapping Guava's sorting of multiset entries.
        for (Multiset.Entry<String> entry : CMineUtil.getEntriesSortedByCount(domains)) {
            LOG.debug(entry);
            countByDomain.put(entry.getElement(), entry.getCount());
            // Each host counts down from its last index to 0.
            currentIndexByDomain.put(entry.getElement(), entry.getCount() - 1);
        }

        for (String url : urls) {
            String domain = getDomain(url);
            int currentIndex = currentIndexByDomain.get(domain);
            int count = countByDomain.get(domain);
            // Multiply in long: size * index overflows int for large lists that
            // are dominated by one host. The quotient is always < slotCount.
            int slot = (int) (((long) slotCount * currentIndex) / count);
            currentIndexByDomain.put(domain, currentIndex - 1);
            addUrl(url, slot);
        }

        // With TOL == 1 every slot is filled; with a larger TOL some pigeonholes
        // stay empty and must not leak into the result.
        List<String> compacted = new ArrayList<String>(urls.size());
        for (String placed : outputUrls) {
            if (placed != null) {
                compacted.add(placed);
            }
        }
        outputUrls = compacted;
        return outputUrls;
    }

    // Places the URL at the slot or the nearest free slot below it; if there is
    // none, at the nearest free slot above it.
    private void addUrl(String url, int slot) {
        if (!fillLower(url, slot)) {
            fillUpper(url, slot);
        }
    }

    // if slot is not free run upwards till next free slot
    private boolean fillUpper(String url, int slot) {
        for (int i = slot; i < outputUrls.size(); i++) {
            if (fill(url, i)) {
                return true;
            }
        }
        return false;
    }

    // if slot is not free run downwards till next free slot
    private boolean fillLower(String url, int slot) {
        for (int i = slot; i >= 0; i--) {
            if (fill(url, i)) {
                return true;
            }
        }
        return false;
    }

    // Stores the URL in the slot if the slot is still free; reports whether it did.
    private boolean fill(String url, int slot) {
        if (outputUrls.get(slot) == null) {
            outputUrls.set(slot, url);
            return true;
        }
        return false;
    }
}
```