我正在按照教程在java中创建一个Web爬虫。当我运行代码时,crawledURL
为null
。 ***格式错误的URL:无限循环中为null。
任何人都可以向我解释为什么会这样吗?
以下是整个代码:
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
import java.net.*;
public class WebCrawler {
public static Queue<String> Queue = new LinkedList<>();
public static Set<String> marked = new HashSet<>();
public static String regex = "http[s]://(\\w+\\.)*(\\w+)";
public static void bfsAlgorithm(String root) throws IOException {
Queue.add(root);
while (!Queue.isEmpty()) {
String crawledURL = Queue.poll();
System.out.println("\n=== Site crawled : " + crawledURL + "===");
//Limiting to a 100 websites here
if(marked.size() > 100)
return;
boolean ok = false;
URL url = null;
BufferedReader br = null;
while (!ok) {
try {
url = new URL(crawledURL);
br = new BufferedReader(new InputStreamReader(url.openStream()));
ok = true;
} catch (MalformedURLException e) {
System.out.println("*** Malformed URL :" + crawledURL);
crawledURL = Queue.poll();
ok = false;
} catch (IOException ioe) {
System.out.println("*** IOException for URL :" + crawledURL);
crawledURL = Queue.poll();
ok = false;
}
}
StringBuilder sb = new StringBuilder();
while((crawledURL = br.readLine()) != null) {
sb.append(crawledURL);
}
crawledURL = sb.toString();
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(crawledURL);
while (matcher.find()){
String w = matcher.group();
if (!marked.contains(w)) {
marked.add(w);
System.out.println("Site added for crawling : " + w);
Queue.add(w);
}
}
}
}
public static void showResults() {
System.out.println("\n\nResults :");
System.out.print("Web sites craweled: " + marked.size() + "\n");
for (String s : marked) {
System.out.println("* " + s);
}
}
public static void main(String[] args) {
try {
bfsAlgorithm("http://www.ssaurel.com/blog");
showResults();
} catch (IOException e) {
//TODO Auto-generated catch block
e.printStackTrace();
}
}
}
答案 0 :(得分:0)
while (!Queue.isEmpty()) {
String crawledURL = Queue.poll();
...
} catch (MalformedURLException e) {
crawledURL = Queue.poll();
第二次不检查队列为空