如何使用并发执行器(即线程池执行器 ExecutorService)重新实现下面的代码,或者有没有更好的做法?基本上,我希望爬虫先抓取给定的网址,然后可以继续跟踪在页面中找到的、指向其他网站的链接。
package Mainpackge;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Entry point of the crawler demo: starts one {@link Worker} thread per seed URL
 * and prints the absolute target of every link each worker collected.
 */
public class main {
    /**
     * Launches one thread per seed URL, then reports the results worker by worker.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // List of URLs to collect data from.
        String[] urls = new String[]{
            "http://www.answers.com/",
            "http://www.britannica.com/",
            "https://ie.yahoo.com/?p=us",
            "https://en.wikipedia.org/wiki/Main_Page",
            // Fixed: the original literal contained a space inside the host name.
            "http://www.worldbook.com/",
            "http://www.computerlanguage.com/",
            "http://www.howstuffworks.com/",
            "http://www.dmoz.org/Computers/Computer_Science/"
        };

        // Create and start workers.
        List<Worker> workers = new ArrayList<>(urls.length);
        for (String url : urls) {
            Worker w = new Worker(url);
            workers.add(w);
            new Thread(w).start();
        }

        // Retrieve results in start order; waitForResults() blocks on each worker in turn.
        for (Worker w : workers) {
            Elements results = w.waitForResults();
            if (results != null) {
                for (Element result : results) {
                    // absUrl resolves the href against the page's base URI.
                    System.out.println(w.getName()+": "+result.absUrl("href"));
                }
            } else {
                // A null result is treated as a failed crawl for this worker.
                System.err.println(w.getName()+" had some error!");
            }
        }
    }
}
/**
 * Fetches a single page and collects its anchor elements.
 *
 * <p>{@link #run()} is meant to execute on a background thread, while another thread
 * calls {@link #waitForResults()} to block until the fetch has finished, successfully
 * or not.
 */
class Worker implements Runnable {
    private final String url;
    private final String name;
    // Incremented without synchronization: construct workers from a single thread.
    private static int number = 0;
    private final Object lock = new Object();
    // Both fields below are guarded by 'lock'.
    private Elements results;
    private boolean done;

    /**
     * @param url address of the page this worker will fetch
     */
    public Worker(String url) {
        this.url = url;
        this.name = "Worker-" + (number++);
    }

    /**
     * @return this worker's generated name, of the form {@code Worker-<n>}
     */
    public String getName() {
        return name;
    }

    /**
     * Downloads the page, selects all {@code <a>} elements and publishes them.
     *
     * <p>Completion is signalled in a {@code finally} block so that waiters are always
     * released, even when the fetch fails; in that case the published result is
     * {@code null}.
     */
    @Override
    public void run() {
        Elements links = null;
        try {
            Document doc = Jsoup.connect(this.url).get();
            links = doc.select("a");
        } catch (IOException e) {
            // You should implement a better error handling code..
            System.err.println("Error while parsing: "+this.url);
            e.printStackTrace();
        } finally {
            // Always mark completion; otherwise waitForResults() would block forever
            // after a failed fetch (including unchecked exceptions from the fetch).
            synchronized (lock) {
                this.results = links;
                this.done = true;
                lock.notifyAll();
            }
        }
    }

    /**
     * Blocks until {@link #run()} has finished.
     *
     * @return the links found on the page, or {@code null} if the fetch failed or the
     *         waiting thread was interrupted (its interrupt status is restored)
     */
    public Elements waitForResults() {
        synchronized (lock) {
            try {
                // Loop guards against spurious wake-ups.
                while (!this.done) {
                    lock.wait();
                }
                return this.results;
            } catch (InterruptedException e) {
                // Restore the flag so callers further up can observe the interruption.
                Thread.currentThread().interrupt();
                return null;
            }
        }
    }
}
答案 0 :(得分:0)
下面是使用 ExecutorService(线程池)和 Callable 实现的完整示例。
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
/**
 * Skeleton of a two-level crawler built on a fixed-size thread pool.
 *
 * <p>Each seed URL is crawled by a {@link Crawler} task; every URL a seed crawl
 * reports is then submitted as a follow-up crawl on the same pool.
 */
public class ThreadPoolExample {
    /**
     * Crawls the seed URLs, then the URLs they report, and shuts the pool down.
     *
     * @param args command-line arguments (unused)
     * @throws InterruptedException if the calling thread is interrupted while waiting
     * @throws ExecutionException if any crawl task threw an exception
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        List<String> urls = Arrays.asList(
            "http://www.answers.com/",
            "http://www.britannica.com/",
            "https://ie.yahoo.com/?p=us",
            "https://en.wikipedia.org/wiki/Main_Page",
            // Fixed: the original literal contained a space inside the host name.
            "http://www.worldbook.com/",
            "http://www.computerlanguage.com/",
            "http://www.howstuffworks.com/",
            "http://www.dmoz.org/Computers/Computer_Science/"
        );
        ExecutorService ex = Executors.newFixedThreadPool(10);
        try {
            List<Future<Element>> results = new ArrayList<>();
            for (String string : urls) {
                results.add(ex.submit(new Crawler(string)));
            }
            // Keep the follow-up futures so failures in those crawls are not lost.
            List<Future<Element>> followUps = new ArrayList<>();
            for (Future<Element> future : results) {
                // get() waits for the task to be done.
                for (String url : future.get().urls) {
                    // Submit a new crawl task for each URL that was found.
                    followUps.add(ex.submit(new Crawler(url)));
                }
            }
            for (Future<Element> followUp : followUps) {
                followUp.get();
            }
        } finally {
            // Shut down on every path: the pool's worker threads are non-daemon and
            // would otherwise keep the JVM alive. awaitTermination is only meaningful
            // after shutdown() has been requested.
            ex.shutdown();
            if (!ex.awaitTermination(2, TimeUnit.SECONDS)) {
                ex.shutdownNow();
            }
        }
    }

    /** Task that crawls one URL and reports the URLs discovered there. */
    public static class Crawler implements Callable<Element>{
        String url;

        /**
         * @param url address this task is responsible for crawling
         */
        public Crawler(String url) {
            this.url = url;
        }

        /**
         * Placeholder crawl: returns a fixed two-entry list instead of fetching
         * {@code url}.
         *
         * @return the URLs found while crawling
         */
        @Override
        public Element call() throws Exception {
            // Implement your crawling logic and return your elements
            return new Element(Arrays.asList("all new urls", "that you found while crwaling"));
        }
    }

    /** Result of one crawl: the list of URLs that were found. */
    public static class Element{
        List<String> urls;

        /**
         * @param urls URLs discovered by a crawl
         */
        public Element(List<String> urls) {
            this.urls = urls;
        }

        @Override
        public String toString() {
            return "Elements found : " + urls.size();
        }
    }
}