给定一个链接(例如 Java.sg/downloads),我需要抓取其中的各个 zip 文件并下载它们。是否有 Apache Commons 库可以完成这项任务?
答案 0 :(得分:0)
我不知道是否有现成的 Apache 库,但我使用 HtmlUnit 来抓取页面及其所有子页面,代码如下所示。之后可以通过 URLConnection 完成下载(例如参见 this page)。
/**
 * Crawls the page at {@code startURL} and every page reachable from it through anchors.
 *
 * <p>Loads the start page, collects its anchors and hands them to
 * {@code recursivelyFollowLinks} together with a fresh, empty set of visited hrefs.
 * The start page is cleaned up and all windows of the web client are closed when the
 * crawl ends, whether it completes normally or with an exception.
 *
 * @param startURL URL of the page where the crawl begins
 * @throws IOException declared by the page load and by the recursive walk
 * @throws SAXException declared by the recursive walk
 */
public static void walkAllHtmlPages(final String startURL) throws IOException, SAXException {
    final WebClient webClient = createWebClient();
    try {
        final HtmlPage page = webClient.getPage(startURL);
        try {
            // Parameterized collections instead of raw Set/List, so element types
            // are checked by the compiler rather than by casts at use sites.
            final Set<String> visitedURLs = new HashSet<String>();
            final List<HtmlAnchor> links = page.getAnchors();
            // now recursively walk all pages
            recursivelyFollowLinks(webClient, links, visitedURLs);
        } finally {
            if (page != null) {
                page.cleanUp();
            }
        }
    } finally {
        // Always release the client's windows, even when the crawl failed.
        webClient.closeAllWindows();
    }
}
/**
 * Builds the {@code WebClient} used for crawling.
 *
 * <p>The client is created with the {@code FIREFOX_3_6} browser version, a timeout
 * value of 30000 (presumably milliseconds; confirm against the HtmlUnit API),
 * JavaScript disabled, and CSS, applets and redirects enabled. The HTML parser
 * listener is cleared, incorrectness notifications are discarded, and CSS errors go
 * to a {@code SilentCssErrorHandler}.
 *
 * @return the newly configured web client
 */
public static WebClient createWebClient() {
    // Listener that deliberately discards every incorrectness notification.
    final IncorrectnessListener discardingListener = new IncorrectnessListener() {
        @Override
        public void notify(String message, Object origin) {
            // Swallow for now, but maybe collect it for optional retrieval?
        }
    };

    final WebClient client = new WebClient(BrowserVersion.FIREFOX_3_6);

    client.setTimeout(30000);
    client.setJavaScriptEnabled(false);
    client.setCssEnabled(true);
    client.setAppletEnabled(true);
    // follow old-school HTTP 302 redirects - standard behaviour
    client.setRedirectEnabled(true);

    client.setHTMLParserListener(null);
    client.setIncorrectnessListener(discardingListener);
    client.setCssErrorHandler(new SilentCssErrorHandler());

    return client;
}
/**
 * Visits every anchor in {@code links} whose href has not been seen before.
 *
 * <p>Each new href is recorded in {@code visitedURLs} before the anchor is passed to
 * {@code visitSubLink}, so an href is handed on at most once per crawl.
 *
 * @param webClient client passed through to {@code visitSubLink}
 * @param links anchors to examine
 * @param visitedURLs hrefs already handled; updated in place
 * @throws SAXException declared by {@code visitSubLink}
 * @throws IOException declared by {@code visitSubLink}
 * @throws IllegalArgumentException wrapping any {@code RuntimeException} raised while
 *         processing the links
 */
private static void recursivelyFollowLinks(WebClient webClient, List<HtmlAnchor> links, Set<String> visitedURLs) throws SAXException, IOException {
    try {
        // The list is typed so it can be iterated as HtmlAnchor; a raw List
        // only yields Object elements and the loop would not compile.
        for (HtmlAnchor link : links) {
            final String url = link.getHrefAttribute();
            // Set.add returns true only when the href was not present yet, which
            // replaces the separate contains-then-add pair.
            if (visitedURLs.add(url)) {
                visitSubLink(webClient, visitedURLs, link, url);
            }
        }
    } catch (RuntimeException e) {
        throw new IllegalArgumentException("While retrieving links: " + getLinksAsString(links), e);
    }
}
/**
 * Clicks {@code link} and, when the result is an HTML page, continues the crawl with
 * that page's anchors.
 *
 * <p>Results that are not an {@code HtmlPage} (for example a link that resolves to a
 * binary download) have no anchors to follow and are skipped. Previously they were
 * cast unconditionally, and the resulting {@code ClassCastException} aborted the
 * whole crawl.
 *
 * @param webClient client passed through to {@code recursivelyFollowLinks}
 * @param visitedURLs hrefs already handled, passed through to the recursive walk
 * @param link anchor to click
 * @param url href of {@code link}, used only in the error message
 * @throws IOException declared by the click and by the recursive walk
 * @throws SAXException declared by the recursive walk
 * @throws RuntimeException wrapping any {@code RuntimeException} raised while clicking
 *         or while walking the sub-links
 */
private static void visitSubLink(WebClient webClient,
        Set<String> visitedURLs, HtmlAnchor link, String url) throws IOException, SAXException {
    try {
        final Object target = link.click();
        // Only HTML pages expose anchors; anything else ends this branch of the crawl.
        if (!(target instanceof HtmlPage)) {
            return;
        }
        final List<HtmlAnchor> sublinks = ((HtmlPage) target).getAnchors();
        recursivelyFollowLinks(webClient, sublinks, visitedURLs);
    } catch (RuntimeException e) { // NOPMD
        throw new RuntimeException("While clicking link: " + link.getId() + " to " + url, e);
    }
}
答案 1 :(得分:-1)
只需使用wget!
Linux wget示例: http://linuxreviews.org/quicktips/wget/
Wget for Windows: http://gnuwin32.sourceforge.net/packages/wget.htm