如何使用线程池执行器重新实现它?

时间:2016-03-27 13:30:47

标签: java multithreading web-crawler jsoup threadpoolexecutor

如何使用并发执行器（即线程池执行器，ThreadPoolExecutor）重新实现它？或者有没有更好的做法？基本上，我希望爬虫抓取给定的网址，然后跟踪页面中发现的指向其他网站的链接。

package Mainpackge;

import java.io.IOException;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class main {

    /**
     * Entry point: starts one {@link Worker} thread per seed URL, then
     * blocks on each worker in turn and prints the absolute target of
     * every link it scraped.
     */
    public static void main(String[] args) {
        // List of URLs to collect data from.
        String[] urls = new String[]{

                "http://www.answers.com/",
                "http://www.britannica.com/",
                "https://ie.yahoo.com/?p=us",
                "https://en.wikipedia.org/wiki/Main_Page",
                "http://ww w.worldbook.com/",
                "http://www.computerlanguage.com/",
                "http://www.howstuffworks.com/",
                "http://www.dmoz.org/Computers/Computer_Science/"
                };

        // Create and start one worker thread per URL.
        List<Worker> workers = new ArrayList<>(urls.length);
        for (String url : urls) {
            Worker w = new Worker(url);
            workers.add(w);
            new Thread(w).start();
        }

        // Retrieve results; waitForResults() blocks until that worker is done.
        for (Worker w : workers) {
            Elements results = w.waitForResults();
            if (results != null) {
                for (Element result : results) {
                    // absUrl("href") resolves the link against the page's base URI.
                    System.out.println(w.getName() + ": " + result.absUrl("href"));
                }
            } else {
                System.err.println(w.getName() + " had some error!");
            }
        }
    }
}

/**
 * Fetches one URL with jsoup on its own thread and publishes the anchor
 * elements it found. {@link #waitForResults()} blocks until the fetch has
 * finished (successfully or not) and returns null on failure.
 */
class Worker implements Runnable {

    private String url;
    private Elements results;
    private String name;
    // Completion flag, published under `lock`. Without it, a fetch failure
    // would leave `results` null forever and waitForResults() would wait
    // forever: the original code only called notifyAll() on success.
    private boolean done = false;
    // NOTE(review): not thread-safe, but workers are constructed from a
    // single thread in main(), so the unsynchronized increment is OK there.
    private static int number = 0;

    private final Object lock = new Object();

    public Worker(String url) {
        this.url = url;
        this.name = "Worker-" + (number++);
    }

    public String getName() {
        return name;
    }

    @Override
    public void run() {
        Elements links = null;
        try {
            Document doc = Jsoup.connect(this.url).get();
            links = doc.select("a");
        } catch (IOException e) {
            // You should implement a better error handling code..
            System.err.println("Error while parsing: "+this.url);
            e.printStackTrace();
        } finally {
            // Always publish completion — success or failure — so that
            // waitForResults() is guaranteed to wake up.
            synchronized (lock) {
                this.results = links;
                this.done = true;
                lock.notifyAll();
            }
        }
    }

    /**
     * Blocks until run() has completed, then returns the scraped links,
     * or null if the fetch failed or this thread was interrupted.
     */
    public Elements waitForResults() {
        synchronized (lock) {
            try {
                while (!done) {
                    lock.wait();
                }
                return this.results;
            } catch (InterruptedException e) {
                // Restore the interrupt status so callers can observe it.
                Thread.currentThread().interrupt();
                return null;
            }
        }
    }
}

1 个答案:

答案 0（得分：0）

使用 ExecutorService 线程池和 Callable 实现的完整示例。

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

public class ThreadPoolExample {
    /**
     * Crawls a seed list of URLs on a fixed-size thread pool, then submits
     * one new crawl task for every URL discovered by the first wave.
     *
     * @throws InterruptedException if the main thread is interrupted while waiting
     * @throws ExecutionException   if a crawl task threw an exception
     */
    public static void main(String[] args) throws InterruptedException, ExecutionException {
        List<String> urls = Arrays.asList(new String[]{
                "http://www.answers.com/",
                "http://www.britannica.com/",
                "https://ie.yahoo.com/?p=us",
                "https://en.wikipedia.org/wiki/Main_Page",
                "http://ww w.worldbook.com/",
                "http://www.computerlanguage.com/",
                "http://www.howstuffworks.com/",
                "http://www.dmoz.org/Computers/Computer_Science/"
                });

        ExecutorService ex = Executors.newFixedThreadPool(10);
        // Bug fix: the original called ex.awaitTermination(2, SECONDS) here,
        // before any task was submitted — that only stalled the main thread
        // for 2 seconds. Awaiting belongs after shutdown(), below.

        // Submit one crawl task per seed URL.
        List<Future<Element>> results = new ArrayList<>();
        for (String string : urls) {
            results.add(ex.submit(new Crawler(string)));
        }

        for (Future<Element> future : results) {
            // Get will wait for the thread to be done
            for (String url : future.get().urls) {
                // ADD A NEW THREAD FOR EACH URLS YOU FOUND !
                ex.submit(new Crawler(url));
            }
        }

        // Stop accepting new tasks, then give in-flight second-wave
        // crawls a chance to finish before the JVM exits.
        ex.shutdown();
        ex.awaitTermination(2, TimeUnit.SECONDS);
    }

    /** One crawl task: fetches {@code url} and returns the links found on it. */
    public static class Crawler implements Callable<Element>{
        String url;
        public Crawler(String url) {
            this.url = url;
        }
        @Override
        public Element call() throws Exception {
            // Implement your crawling logic and return your elements
            return new Element(Arrays.asList(new String[]{"all new urls", "that you found while crwaling"}));
        }

    }

    /** Result of one crawl: the list of URLs discovered on the page. */
    public static class Element{
        List<String> urls;
        public Element(List<String> urls) {
            this.urls = urls;
        }
        @Override
        public String toString() {
            return "Elements found : " + urls.size();
        }
    }
}