Crawler4j with a Grails App

Posted: 2014-06-26 20:32:52

Tags: grails groovy crawler4j

I am building a crawler in Groovy using Grails. I am using Crawler4j and following this tutorial.

  1. I created a new Grails project.
  2. Put the BasicCrawlController.groovy file under the controllers package.
  3. Did not create any view, because I expected my crawled data to show up in my crawlStorageFolder when the app runs (correct me if my understanding is flawed).
  4. After that I simply ran the application with run-app, but I did not see any crawled data anywhere.

    1. Am I right to expect some files to be created at the crawlStorageFolder location, which I supplied as C:/crawl/crawler4jStorage?
    2. Do I need to create any view for this?
    3. If I want to invoke this crawler controller from another view by clicking a form's submit button, can I write <g:form name="submitWebsite" url="[controller:'BasicCrawlController ']">?
    4. I ask because I do not have any method in this controller, so is this the right way to call it?

My code is as follows:

    //All necessary imports

    public class BasicCrawlController {

        static main(args) throws Exception {
            String crawlStorageFolder = "C:/crawl/crawler4jStorage";
            int numberOfCrawlers = 1;
            //int maxDepthOfCrawling = -1;    default

            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder(crawlStorageFolder);
            config.setPolitenessDelay(1000);
            config.setMaxPagesToFetch(100);
            config.setResumableCrawling(false);

            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

            controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
            controller.start(BasicCrawler.class, 1);
        }
    }

    class BasicCrawler extends WebCrawler {

        final static Pattern FILTERS = Pattern
            .compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4" +
                     "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

        /**
         * You should implement this function to specify whether the given url
         * should be crawled or not (based on your crawling logic).
         */
        @Override
        boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase()
            !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
        }

        /**
         * This function is called when a page is fetched and ready to be processed
         * by your program.
         */
        @Override
        void visit(Page page) {
            int docid = page.getWebURL().getDocid()
            String url = page.getWebURL().getURL()
            String domain = page.getWebURL().getDomain()
            String path = page.getWebURL().getPath()
            String subDomain = page.getWebURL().getSubDomain()
            String parentUrl = page.getWebURL().getParentUrl()
            String anchor = page.getWebURL().getAnchor()

            println("Docid: ${docid}")
            println("URL: ${url}")
            println("Domain: '${domain}'")
            println("Sub-domain: '${subDomain}'")
            println("Path: '${path}'")
            println("Parent page: ${parentUrl}")
            println("Anchor text: ${anchor}")

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
                String text = htmlParseData.getText()
                String html = htmlParseData.getHtml()
                List<WebURL> links = htmlParseData.getOutgoingUrls()

                println("Text length: " + text.length())
                println("Html length: " + html.length())
                println("Number of outgoing links: " + links.size())
            }

            Header[] responseHeaders = page.getFetchResponseHeaders()
            if (responseHeaders != null) {
                println("Response headers:")
                for (Header header : responseHeaders) {
                    println("\t ${header.getName()} : ${header.getValue()}")
                }
            }
            println("=============")
        }
    }
      

1 Answer:

Answer 0 (score: 2):

I will try to translate your code into the Grails conventions.

Use this under grails-app/controllers:
    // crawler4j classes used below
    import edu.uci.ics.crawler4j.crawler.CrawlConfig
    import edu.uci.ics.crawler4j.crawler.CrawlController
    import edu.uci.ics.crawler4j.fetcher.PageFetcher
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer

    class BasicCrawlController {

        // Reached as /basicCrawl/index under the default Grails URL mapping
        def index() {
            String crawlStorageFolder = "C:/crawl/crawler4jStorage";
            int numberOfCrawlers = 1;
            //int maxDepthOfCrawling = -1;    default

            CrawlConfig crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder(crawlStorageFolder);
            crawlConfig.setPolitenessDelay(1000);
            crawlConfig.setMaxPagesToFetch(100);
            crawlConfig.setResumableCrawling(false);

            PageFetcher pageFetcher = new PageFetcher(crawlConfig);
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
            CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);

            controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
            controller.start(BasicCrawler.class, 1);

            render "done crawling"
        }
    }
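
With the crawl moved into a regular index() action, a GSP view can target it by controller and action name rather than the class-name string from question 3. A minimal sketch (the view location and button label here are made up for illustration; Grails maps BasicCrawlController to the logical name basicCrawl):

    <%-- hypothetical view, e.g. grails-app/views/crawl/start.gsp --%>
    <g:form name="submitWebsite" controller="basicCrawl" action="index">
        <g:submitButton name="start" value="Start crawl"/>
    </g:form>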

Use this under src/groovy:
    import java.util.regex.Pattern

    import org.apache.http.Header

    import edu.uci.ics.crawler4j.crawler.Page
    import edu.uci.ics.crawler4j.crawler.WebCrawler
    import edu.uci.ics.crawler4j.parser.HtmlParseData
    import edu.uci.ics.crawler4j.url.WebURL

    class BasicCrawler extends WebCrawler {

        final static Pattern FILTERS = Pattern
            .compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4" +
                     "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

        /**
         * You should implement this function to specify whether the given url
         * should be crawled or not (based on your crawling logic).
         */
        @Override
        boolean shouldVisit(WebURL url) {
            String href = url.getURL().toLowerCase()
            !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
        }

        /**
         * This function is called when a page is fetched and ready to be processed
         * by your program.
         */
        @Override
        void visit(Page page) {
            int docid = page.getWebURL().getDocid()
            String url = page.getWebURL().getURL()
            String domain = page.getWebURL().getDomain()
            String path = page.getWebURL().getPath()
            String subDomain = page.getWebURL().getSubDomain()
            String parentUrl = page.getWebURL().getParentUrl()
            String anchor = page.getWebURL().getAnchor()

            println("Docid: ${docid}")
            println("URL: ${url}")
            println("Domain: '${domain}'")
            println("Sub-domain: '${subDomain}'")
            println("Path: '${path}'")
            println("Parent page: ${parentUrl}")
            println("Anchor text: ${anchor}")

            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
                String text = htmlParseData.getText()
                String html = htmlParseData.getHtml()
                List<WebURL> links = htmlParseData.getOutgoingUrls()

                println("Text length: " + text.length())
                println("Html length: " + html.length())
                println("Number of outgoing links: " + links.size())
            }

            Header[] responseHeaders = page.getFetchResponseHeaders()
            if (responseHeaders != null) {
                println("Response headers:")
                for (Header header : responseHeaders) {
                    println("\t ${header.getName()} : ${header.getValue()}")
                }
            }
            println("=============")
        }
    }
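
One assumption worth making explicit: crawler4j has to be on the app's classpath for either class to compile. In a Grails 2.x project that would normally be declared in grails-app/conf/BuildConfig.groovy; the version below is a guess based on what was current at the time of the question:

    // grails-app/conf/BuildConfig.groovy (Grails 2.x); version is assumed
    grails.project.dependency.resolution = {
        // ...existing inherits/log/repositories settings...
        dependencies {
            compile 'edu.uci.ics:crawler4j:3.5'
        }
    }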