
时间:2014-03-03 08:26:45

标签: java recursion web-crawler


    public class WebCrawler {

   private static String url;
   private static int maxCrawlDepth;
   private static String filePath;

   /* Recursive function that crawls all web pages found on a given web page.
    * This function also saves elements from the DownloadRepository to disk.

  public static void crawling(WebPage webpage, int currentCrawlDepth, int maxCrawlDepth) {


     HashMap<String, WebPage> pages = webpage.getCrawledWebPages();

        if(currentCrawlDepth < maxCrawlDepth) {
           for(WebPage wp : pages.values()) {
              crawling(wp, currentCrawlDepth+1, maxCrawlDepth);

   public static void main(String[] args) {

      if(args.length != 3) {
         System.out.println("Must pass three parameters");

      url = "";
      maxCrawlDepth = 0;
      filePath = "";

      url = args[0];
      try {
         URL testUrl = new URL(url);
         URLConnection urlConnection = testUrl.openConnection();
      } catch (MalformedURLException e) {
         System.out.println("Not a valid URL");
      } catch (IOException e) {
         System.out.println("Could not open URL");

      try {
         maxCrawlDepth = Integer.parseInt(args[1]);
      } catch (NumberFormatException e) {
         System.out.println("Argument is not an int");

      filePath = args[2];
      File path = new File(filePath);
      if(!path.exists()) {
         System.out.println("File Path is invalid");

      WebPage webpage = new WebPage(url);
      crawling(webpage, 0, maxCrawlDepth);

      System.out.println("Web crawl is complete");



    public class WebPage implements WebElement {

   private static Elements images;
   private static Elements links;

   private HashMap<String, WebImage> webImages = new HashMap<String, WebImage>();
   private HashMap<String, WebPage> webPages = new HashMap<String, WebPage>();
   private HashMap<String, WebFile> files = new HashMap<String, WebFile>();

   private String url;

   public WebPage(String url) {
      this.url = url;

   /* The crawl method parses the html on a given web page
    * and adds the elements of the web page to the Download
    * Repository.
   public void crawl(int currentCrawlDepth) {

      System.out.print("Crawling " + url + " at crawl depth ");
      System.out.println(currentCrawlDepth + "\n");

      Document doc = null;

      try {
         HttpConnection httpConnection = (HttpConnection) Jsoup.connect(url);
         doc = httpConnection.get();

      } catch (MalformedURLException e) {
      } catch (IOException e) {
      } catch (IllegalArgumentException e) {
         System.out.println(url + "is not a valid URL");

      DownloadRepository downloadRepository = DownloadRepository.getInstance();

      if(doc != null) {
         images = doc.select("img");
         links = doc.select("a[href]");

         for(Element image : images) {
            String imageUrl = image.absUrl("src");
            if(!webImages.containsValue(image)) {
               WebImage webImage = new WebImage(imageUrl);
               webImages.put(imageUrl, webImage);
               downloadRepository.addElement(imageUrl, webImage);
               System.out.println("Added image at " + imageUrl);

         HttpConnection mimeConnection = null;
         Response mimeResponse = null;

         for(Element link: links) {
            String linkUrl = link.absUrl("href");
            linkUrl = linkUrl.trim();
            if(!linkUrl.contains("#")) {
               try {
                  mimeConnection = (HttpConnection) Jsoup.connect(linkUrl);
                  mimeResponse = (Response) mimeConnection.execute();
               } catch (Exception e) {

               String contentType = null;
               if(mimeResponse != null) {
                  contentType = mimeResponse.contentType();

               if(contentType == null) {
               if(contentType.toString().equals("text/html")) {
                  if(!webPages.containsKey(linkUrl)) {
                     WebPage webPage = new WebPage(linkUrl);
                     webPages.put(linkUrl, webPage);
                     downloadRepository.addElement(linkUrl, webPage);
                     System.out.println("Added webPage at " + linkUrl);
               else {
                  if(!files.containsValue(link)) {
                     WebFile webFile = new WebFile(linkUrl);
                     files.put(linkUrl, webFile);
                     downloadRepository.addElement(linkUrl, webFile);
                     System.out.println("Added file at " + linkUrl);



      System.out.print("\nFinished crawling " + url + " at crawl depth ");
      System.out.println(currentCrawlDepth + "\n");

   public HashMap<String, WebImage> getImages() {
      return webImages;

   public HashMap<String, WebPage> getCrawledWebPages() {
      return webPages;

   public HashMap<String, WebFile> getFiles() {
      return files;

   public String getUrl() {
      return url;

   public void saveToDisk(String filePath) {



Crawling https://www.google.com/ at crawl depth 0

Added webPage at http://www.google.com/intl/en/options/
Added webPage at https://www.google.com/intl/en/ads/
Added webPage at https://www.google.com/services/
Added webPage at https://www.google.com/intl/en/about.html
Added webPage at https://www.google.com/intl/en/policies/
Finished crawling https://www.google.com/ at crawl depth 0

Crawling https://www.google.com/services/ at crawl depth 1

Added webPage at http://www.google.com/intl/en/enterprise/apps/business/?utm_medium=et&utm_campaign=en&utm_source=us-en-et-nelson_bizsol
Added webPage at https://www.google.com/services/sitemap.html
Added webPage at https://www.google.com/intl/en/about/
Added webPage at https://www.google.com/intl/en/policies/
Finished crawling https://www.google.com/services/ at crawl depth 1

**Crawling https://www.google.com/intl/en/policies/ at crawl depth 2**

Added webPage at https://www.google.com/intl/en/policies/
Added webPage at https://www.google.com/intl/en/policies/terms/
Added webPage at https://www.google.com/intl/en/policies/privacy/
Added webPage at https://www.google.com/intl/en/policies/terms/
Added webPage at https://www.google.com/intl/en/policies/faq/
Added webPage at https://www.google.com/intl/en/policies/technologies/
Added webPage at https://www.google.com/intl/en/about/
Added webPage at https://www.google.com/intl/en/policies/

Finished crawling https://www.google.com/intl/en/policies/ at crawl depth 2

**Crawling https://www.google.com/intl/en/policies/ at crawl depth 3**


1 个答案:

答案 0 :(得分:1)




// Sketch from the answer: thread a record of already-visited pages through
// the recursive crawl. Only the field and the new method signature are given;
// the method body and the closing braces are not shown in this excerpt.
public class WebCrawler {

    // Pages seen so far, keyed by a String (presumably the page URL - confirm).
    // NOTE(review): this is an instance field, while crawling(...) below is
    // static and receives its own `visited` parameter - confirm how the two
    // are meant to be connected.
    private HashMap<String, WebPage> visited = new HashMap<String, WebPage>();

    // Same parameters as the question's crawling(...), plus the `visited` map
    // as the first argument.
    public static void crawling(Map<String, WebPage> visited, WebPage webpage, int currentCrawlDepth, int maxCrawlDepth) {



public class Visited {

    private HashMap<String, WebPage> webPages = new HashMap<String, WebPage>();

    public boolean visit(String url, WebPage page) {
        if (webPages.containsKey(page)) {
            return false;
        webPages.put(url, page);
        return true;

    private HashMap<String, WebImage> webImages = new HashMap<String, WebImage>();

    public boolean visit(String url, WebImage image) {
        if (webImages.containsKey(image)) {
            return false;
        webImages.put(url, image);
        return true;
