I'm trying to write my first web crawler in Java.
I thought I had it working, but when printing the thread details I noticed that only one thread is actually being used.
I start my worker threads (10 of them, yet there seems to be no switching between them). Also, judging by the output (attached below), URL handling appears to begin before all of the threads have started running.
Can you tell me what I should do, or what I'm doing wrong?
Main class
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;

public class Main {

    public static void main(String[] args) {
        String url = "https://jsoup.org/";
        Integer workers = 10;
        Integer totalToScan = 40;
        Set<String> visitedUrls = ConcurrentHashMap.newKeySet();
        LinkedBlockingQueue<String> urlsToCrawlQueue = new LinkedBlockingQueue<>();
        ExecutorService executor = Executors.newFixedThreadPool(workers);
        Crawler crawler = new Crawler(visitedUrls, urlsToCrawlQueue, totalToScan, executor, workers);
        crawler.crawl(url);
        //crawler.getExecutor().shutdown();
        //try {
        //    //wait until all threads have ended
        //    crawler.getExecutor().awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        //} catch (InterruptedException ex) {
        //    ex.printStackTrace();
        //}
        System.out.println("Crawling done. Set size: " + crawler.getVisitedUrls().size());
        //printing the set
        int i = 0;
        for (String x : crawler.getVisitedUrls()) {
            System.out.println("Link #" + i + " --> " + x);
            i++;
        }
    }
}
Crawler class
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class Crawler {

    private Set<String> visitedUrls;
    private LinkedBlockingQueue<String> urlsToCrawlQueue;
    private Integer maxSetSize;
    private ExecutorService executor;
    private Integer numOfWorkers;

    public Crawler(Set<String> visitedUrls, LinkedBlockingQueue<String> urlsToCrawlQueue, Integer maxSetSize, ExecutorService executor, Integer workers) {
        this.visitedUrls = visitedUrls;
        this.urlsToCrawlQueue = urlsToCrawlQueue;
        this.maxSetSize = maxSetSize;
        this.executor = executor;
        this.numOfWorkers = workers;
    }

    public void crawl(String url) {
        init(url);
    }

    private void init(String url) {
        for (int i = 0; i < numOfWorkers; i++) {
            CrawlerWorker2 c = new CrawlerWorker2(this);
            executor.execute(c);
        }
        //Inserting the first url into the queue. Expecting the threads to start pulling and working on it
        try {
            urlsToCrawlQueue.put(url);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        executor.shutdown();
        try {
            //wait until all threads have ended
            executor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException ex) {
            ex.printStackTrace();
        }
    }

    public Set<String> getVisitedUrls() {
        return visitedUrls;
    }

    public LinkedBlockingQueue<String> getUrlsToCrawlQueue() {
        return urlsToCrawlQueue;
    }

    public Integer getMaxSetSize() {
        return maxSetSize;
    }

    public ExecutorService getExecutor() {
        return executor;
    }
}
Crawler worker class
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

public class CrawlerWorker2 implements Runnable {

    private Crawler crawler;

    public CrawlerWorker2(Crawler crawler) {
        this.crawler = crawler;
    }

    public void run() {
        System.out.println("thread " + Thread.currentThread().getName() + " is starting to run");
        doWork();
    }

    private void doWork() {
        while (!crawler.getUrlsToCrawlQueue().isEmpty() && crawler.getVisitedUrls().size() < crawler.getMaxSetSize()) {
            //get url from queue
            String currUrl = null;
            try {
                currUrl = crawler.getUrlsToCrawlQueue().take();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            if (currUrl == null || crawler.getVisitedUrls().contains(currUrl)) {
                System.out.println("Url is either null or has been visited before. Skipping...");
                continue;
            }
            System.out.println("Handling url: " + currUrl);
            Connection con = Jsoup.connect(currUrl).timeout(1000 * 5);
            Connection.Response response = null;
            try {
                response = con.execute();
            } catch (IOException e) {
                System.out.println("Error trying to reach url: " + currUrl);
                continue;
            }
            Document doc = null;
            if (response.statusCode() != 200) {
                continue;
            }
            try {
                doc = con.get();
            } catch (IOException e) {
                System.out.println("Error getting doc from url " + currUrl);
                continue;
            }
            crawler.getVisitedUrls().add(currUrl);
            System.out.println("Current Set size: " + crawler.getVisitedUrls().size());
            String title = doc.title();
            System.out.println("*********************************************************************");
            System.out.println("Worker Number: " + Thread.currentThread().getId() + ", URL : " + currUrl + " title: " + title);
            System.out.println("*********************************************************************");
            Elements links = doc.select("a[href]");
            for (Element link : links) {
                String currentLink = link.attr("abs:href");
                try {
                    crawler.getUrlsToCrawlQueue().put(currentLink);
                } catch (InterruptedException e) {
                    System.out.println("INTERRUPT ERROR!!!");
                    e.printStackTrace();
                }
            }
        }
    }
}
Here is the output:
thread pool-1-thread-7 is starting to run
thread pool-1-thread-8 is starting to run
thread pool-1-thread-3 is starting to run
thread pool-1-thread-4 is starting to run
thread pool-1-thread-2 is starting to run
thread pool-1-thread-1 is starting to run
Handling url: https://jsoup.org/
thread pool-1-thread-5 is starting to run
thread pool-1-thread-9 is starting to run
thread pool-1-thread-6 is starting to run
thread pool-1-thread-10 is starting to run
Current Set size: 1
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/ title: jsoup Java HTML Parser, with best of DOM, CSS, and jquery
*********************************************************************
Url is either null or has been visited before. Skipping...
Handling url: https://jsoup.org/news/
Current Set size: 2
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/news/ title: News: jsoup Java HTML parser
*********************************************************************
Handling url: https://jsoup.org/bugs
Current Set size: 3
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/bugs title: Bugs: jsoup Java HTML parser
*********************************************************************
Handling url: https://jsoup.org/discussion
Current Set size: 4
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/discussion title: Mailing lists: jsoup HTML parser
*********************************************************************
Handling url: https://jsoup.org/download
Current Set size: 5
*********************************************************************
Worker Number: 18, URL : https://jsoup.org/download title: Download the jsoup HTML parser library
*********************************************************************
Handling url: https://jsoup.org/apidocs/
Answer 0 (score: 0)
You only ever put a single URL into the queue, and you do so after the worker threads have already been submitted to the executor.
Each worker's while loop starts by checking !urlsToCrawlQueue.isEmpty(), so any worker that reaches that check before the first put() sees an empty queue and exits immediately. One thread happens to grab the URL, and by the time it enqueues the links it finds, the other workers have already terminated, so all further crawling is done by that single surviving thread.
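One way to avoid this race is to stop using isEmpty() as the loop guard and instead let each worker block on poll() with a timeout, so a worker only gives up after the queue has stayed empty for a while. Here is a minimal sketch of a replacement doWork() loop for your CrawlerWorker2, assuming the rest of the class stays as posted; the 10-second idle timeout is an arbitrary choice, not something from your code:

import java.util.concurrent.TimeUnit;

private void doWork() {
    while (crawler.getVisitedUrls().size() < crawler.getMaxSetSize()) {
        String currUrl;
        try {
            // Block for up to 10 seconds instead of testing isEmpty():
            // a worker only exits once the queue has stayed empty that long.
            currUrl = crawler.getUrlsToCrawlQueue().poll(10, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return;
        }
        if (currUrl == null) {
            // Queue stayed empty for the whole timeout: assume the crawl has drained.
            return;
        }
        if (crawler.getVisitedUrls().contains(currUrl)) {
            continue;
        }
        // ...fetch, parse, and enqueue the page's links exactly as before...
    }
}

It would also help to put the seed URL into the queue before submitting the workers in init(), so the first worker to run already has something to take. Note that the size check against maxSetSize is still a check-then-act race (several threads can pass it at the same time), so with this sketch you may slightly overshoot the 40-URL limit.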