我有下面的代码来提取给定URL中的页面,但是我不确定如何以树状结构显示它们。
公共类BasicWebCrawler {
private HashSet<String> links;
public BasicWebCrawler() {
links = new HashSet<String>();
}
public void getPageLinks(String URL) {
//4. Check if you have already crawled the URLs
//(we are intentionally not checking for duplicate content in this example)
if (!links.contains(URL)) {
try {
//4. (i) If not add it to the index
if (links.add(URL)) {
System.out.println(URL);
}
//2. Fetch the HTML code
Document document = Jsoup.connect(URL).get();
//3. Parse the HTML to extract links to other URLs
Elements linksOnPage = document.select("a[href^=\"" +URL+ "\"]");
//5. For each extracted URL... go back to Step 4.
for (Element page : linksOnPage) {
getPageLinks(page.attr("abs:href"));
}
} catch (IOException e) {
System.err.println("For '" + URL + "': " + e.getMessage());
}
}
}
public static void main(String[] args) {
//1. Pick a URL from the frontier
new BasicWebCrawler().getPageLinks("https://www.wikipedia.com/");
}
}
答案 0 :(得分:0)
好吧,我想我可以按照您的要求做,当检查站点上的所有链接或站点上没有链接时,递归将完成,但是在Internet中实际上是不可行的,可笑的是,您可以从哪里访问一个站点只需点击第一个未选中的链接即可:
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.HashSet;
public class BasicWebCrawler {
private HashSet<String> links;
public BasicWebCrawler() {
links = new HashSet<String>();
}
public void getPageLinks(String URL, int level) {
//4. Check if you have already crawled the URLs
//(we are intentionally not checking for duplicate content in this example)
if (!links.contains(URL)) {
try {
//4. (i) If not add it to the index
if (links.add(URL)) {
for(int i = 0; i < level; i++) {
System.out.print("-");
}
System.out.println(URL);
}
//2. Fetch the HTML code
Document document = Jsoup.connect(URL).get();
//3. Parse the HTML to extract links to other URLs
Elements linksOnPage = document.select("a[href]");
//5. For each extracted URL... go back to Step 4.
for (Element page : linksOnPage) {
getPageLinks(page.attr("abs:href"), level + 1);
}
} catch (IOException e) {
System.err.println("For '" + URL + "': " + e.getMessage());
}
}
}
public static void main(String[] args) {
//1. Pick a URL from the frontier
new BasicWebCrawler().getPageLinks("http://mysmallwebpage.com/", 0);
}
}