I'm trying to write my first web crawler in Java.
I thought I had it working, but when printing the thread details I noticed that only one thread is actually being used.
I start my worker threads (10 of them, yet there seems to be no switching between them). Also, judging by the output (attached below), URL handling appears to begin before all of the threads have started running.
Can you tell me what I should do, or what I'm doing wrong?
Main class
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;

public class Main {

    public static void main(String[] args) {
        String url = "https://jsoup.org/";
        Integer workers = 10;
        Integer totalToScan = 40;
        Set<String> visitedUrls = ConcurrentHashMap.newKeySet();
        LinkedBlockingQueue<String> urlsToCrawlQueue = new LinkedBlockingQueue<>();
        ExecutorService executor = Executors.newFixedThreadPool(workers);
        Crawler crawler = new Crawler(visitedUrls, urlsToCrawlQueue, totalToScan, executor, workers);
        crawler.crawl(url);
        //crawler.getExecutor().shutdown();
        //try {
        //    //wait until all threads have ended
        //    crawler.getExecutor().awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        //} catch (InterruptedException ex) {
        //    ex.printStackTrace();
        //}
        System.out.println("Crawling done. Set size: " + crawler.getVisitedUrls().size());
        //printing the set
        int i = 0;
        for (String x : crawler.getVisitedUrls()) {
            System.out.println("Link #" + i + " --> " + x);
            i++;
        }
    }
}
Crawler class
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class Crawler {

    private Set<String> visitedUrls;
    private LinkedBlockingQueue<String> urlsToCrawlQueue;
    private Integer maxSetSize;
    private ExecutorService executor;
    private Integer numOfWorkers;

    public Crawler(Set<String> visitedUrls, LinkedBlockingQueue<String> urlsToCrawlQueue, Integer maxSetSize, ExecutorService executor, Integer workers) {
        this.visitedUrls = visitedUrls;
        this.urlsToCrawlQueue = urlsToCrawlQueue;
        this.maxSetSize = maxSetSize;
        this.executor = executor;
        this.numOfWorkers = workers;
    }

    public void crawl(String url) {
        init(url);
    }

    private void init(String url) {
        for (int i = 0; i < numOfWorkers; i++) {
            CrawlerWorker2 c = new CrawlerWorker2(this);
            executor.execute(c);
        }
        //Inserting the first url into the queue. Expecting the threads to start pulling and working on it
        try {
            urlsToCrawlQueue.put(url);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        executor.shutdown();
        try {
            //wait until all threads have ended
            executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException ex) {
            ex.printStackTrace();
        }
    }

    public Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    public LinkedBlockingQueue<String> getUrlsToCrawlQueue() {
        return urlsToCrawlQueue;
    }

    public Integer getMaxSetSize() {
        return maxSetSize;
    }

    public ExecutorService getExecutor() {
        return executor;
    }
}
Crawler worker class
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class CrawlerWorker2 implements Runnable {

    private Crawler crawler;

    public CrawlerWorker2(Crawler crawler) {
        this.crawler = crawler;
    }

    public void run() {
        System.out.println("thread " + Thread.currentThread().getName() + " is starting to run");
        doWork();
    }

    private void doWork() {
        while (!crawler.getUrlsToCrawlQueue().isEmpty() && crawler.getVisitedUrls().size() < crawler.getMaxSetSize()) {
            //get url from queue
            String currUrl = null;
            try {
                currUrl = crawler.getUrlsToCrawlQueue().take();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            if (currUrl == null || crawler.getVisitedUrls().contains(currUrl)) {
                System.out.println("Url is either null or has been visited before. Skipping...");
                continue;
            }
            System.out.println("Handling url: " + currUrl);
            Connection con = Jsoup.connect(currUrl).timeout(1000 * 5);
            Connection.Response response = null;
            try {
                response = con.execute();
            } catch (IOException e) {
                System.out.println("Error trying to reach url: " + currUrl);
                continue;
            }
            Document doc = null;
            if (response.statusCode() != 200) {
                continue;
            }
            try {
                doc = con.get();
            } catch (IOException e) {
                System.out.println("Error getting doc from url " + currUrl);
                continue;
            }
            crawler.getVisitedUrls().add(currUrl);
            System.out.println("Current Set size: " + crawler.getVisitedUrls().size());
            String title = doc.title();
            System.out.println("*********************************************************************");
            System.out.println("Worker Number: " + Thread.currentThread().getId() + ", URL : " + currUrl + " title: " + title);
            System.out.println("*********************************************************************");
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                String currentLink = link.attr("abs:href");
                try {
                    crawler.getUrlsToCrawlQueue().put(currentLink);
                } catch (InterruptedException e) {
                    System.out.println("INTERRUPT ERROR!!!");
                    e.printStackTrace();
                }
            }
        }
    }
}
Here is the output:
thread pool-1-thread-7 is starting to run
thread pool-1-thread-8 is starting to run
thread pool-1-thread-3 is starting to run
thread pool-1-thread-4 is starting to run
thread pool-1-thread-2 is starting to run
thread pool-1-thread-1 is starting to run
Handling url: https://jsoup.org/
thread pool-1-thread-5 is starting to run
thread pool-1-thread-9 is starting to run
thread pool-1-thread-6 is starting to run
thread pool-1-thread-10 is starting to run
Current Set size: 1
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/ title: jsoup Java HTML Parser, with best of DOM, CSS, and jquery
*********************************************************************
Url is either null or has been visited before. Skipping...
Handling url: https://jsoup.org/news/
Current Set size: 2
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/news/ title: News: jsoup Java HTML parser
*********************************************************************
Handling url: https://jsoup.org/bugs
Current Set size: 3
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/bugs title: Bugs: jsoup Java HTML parser
*********************************************************************
Handling url: https://jsoup.org/discussion
Current Set size: 4
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/discussion title: Mailing lists: jsoup HTML parser
*********************************************************************
Handling url: https://jsoup.org/download
Current Set size: 5
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/download title: Download the jsoup HTML parser library
*********************************************************************
Handling url: https://jsoup.org/apidocs/
Answer 0 (score: 0)
You only ever put a single URL into the queue, and you do so after the worker threads have already been submitted to the executor.
Each worker's while loop starts by checking !urlsToCrawlQueue.isEmpty(), so any worker that reaches that check before the first put() sees an empty queue and exits immediately. One thread happens to grab the URL, and by the time it enqueues the links it finds, the other workers have already terminated, so all further crawling is done by that single surviving thread.
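One way to avoid this race is to stop using isEmpty() as the loop guard and instead let each worker block on poll() with a timeout, so a worker only gives up after the queue has stayed empty for a while. Here is a minimal sketch of a replacement doWork() loop for your CrawlerWorker2, assuming the rest of the class stays as posted; the 10-second idle timeout is an arbitrary choice, not something from your code:

import java.util.concurrent.TimeUnit;

private void doWork() {
    while (crawler.getVisitedUrls().size() < crawler.getMaxSetSize()) {
        String currUrl;
        try {
            // Block for up to 10 seconds instead of testing isEmpty():
            // a worker only exits once the queue has stayed empty that long.
            currUrl = crawler.getUrlsToCrawlQueue().poll(10, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
        }
        if (currUrl == null) {
            // Queue stayed empty for the whole timeout: assume the crawl has drained.
            return;
        }
        if (crawler.getVisitedUrls().contains(currUrl)) {
            continue;
        }
        // ...fetch, parse, and enqueue the page's links exactly as before...
    }
}

It would also help to put the seed URL into the queue before submitting the workers in init(), so the first worker to run already has something to take. Note that the size check against maxSetSize is still a check-then-act race (several threads can pass it at the same time), so with this sketch you may slightly overshoot the 40-URL limit.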