I'm building a crawler in Groovy on Grails. I'm using Crawler4j and following this tutorial.
After that I simply ran the application with run-app, but I don't see any crawled data anywhere.
Should I instead be creating a view with a form like this:
<g:form name="submitWebsite" url="[controller:'BasicCrawlController ']">
I ask because I don't have any actions in this controller, so is this the right way to invoke it?
My code is below:
//All necessary imports

public class BasicCrawlController {

    static main(args) throws Exception {
        String crawlStorageFolder = "C:/crawl/crawler4jStorage";
        int numberOfCrawlers = 1;
        //int maxDepthOfCrawling = -1; default

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);
        config.setPolitenessDelay(1000);
        config.setMaxPagesToFetch(100);
        config.setResumableCrawling(false);

        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
        controller.start(BasicCrawler.class, 1);
    }
}
class BasicCrawler extends WebCrawler {

    final static Pattern FILTERS = Pattern
        .compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4" +
                 "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * You should implement this function to specify whether the given url
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid} ")
        println("URL: ${url} ")
        println("Domain: '${domain}'")
        println("Sub-domain: ' ${subDomain}'")
        println("Path: '${path}'")
        println("Parent page:${parentUrl} ")
        println("Anchor text: ${anchor} ")

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()

            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }

        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            for (Header header : responseHeaders) {
                println("\t ${header.getName()} : ${header.getValue()}")
            }
        }
        println("=============")
    }
}
Answer (score: 2)
I'll try to translate your code into Grails conventions.
In grails-app/controllers:
class BasicCrawlController {

    def index() {
        String crawlStorageFolder = "C:/crawl/crawler4jStorage";
        int numberOfCrawlers = 1;
        //int maxDepthOfCrawling = -1; default

        CrawlConfig crawlConfig = new CrawlConfig();
        crawlConfig.setCrawlStorageFolder(crawlStorageFolder);
        crawlConfig.setPolitenessDelay(1000);
        crawlConfig.setMaxPagesToFetch(100);
        crawlConfig.setResumableCrawling(false);

        PageFetcher pageFetcher = new PageFetcher(crawlConfig);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);

        controller.addSeed("http://en.wikipedia.org/wiki/Web_crawler")
        controller.start(BasicCrawler.class, 1);

        render "done crawling"
    }
}
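As for the <g:form> in your question: the form should reference the controller's logical name (basicCrawl, i.e. the class name without the Controller suffix) and an action, rather than the class name itself. A minimal sketch, assuming you keep the index action above (the button name and label are just placeholders):
<g:form name="submitWebsite" controller="basicCrawl" action="index">
    <g:submitButton name="crawl" value="Start crawl"/>
</g:form>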
In src/groovy:
class BasicCrawler extends WebCrawler {

    final static Pattern FILTERS = Pattern
        .compile(".*(\\.(css|js|bmp|gif|jpe?g" + "|png|tiff?|mid|mp2|mp3|mp4" +
                 "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))\$")

    /**
     * You should implement this function to specify whether the given url
     * should be crawled or not (based on your crawling logic).
     */
    @Override
    boolean shouldVisit(WebURL url) {
        String href = url.getURL().toLowerCase()
        !FILTERS.matcher(href).matches() && href.startsWith("http://en.wikipedia.org/wiki/Web_crawler/")
    }

    /**
     * This function is called when a page is fetched and ready to be processed
     * by your program.
     */
    @Override
    void visit(Page page) {
        int docid = page.getWebURL().getDocid()
        String url = page.getWebURL().getURL()
        String domain = page.getWebURL().getDomain()
        String path = page.getWebURL().getPath()
        String subDomain = page.getWebURL().getSubDomain()
        String parentUrl = page.getWebURL().getParentUrl()
        String anchor = page.getWebURL().getAnchor()

        println("Docid: ${docid} ")
        println("URL: ${url} ")
        println("Domain: '${domain}'")
        println("Sub-domain: ' ${subDomain}'")
        println("Path: '${path}'")
        println("Parent page:${parentUrl} ")
        println("Anchor text: ${anchor} ")

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData()
            String text = htmlParseData.getText()
            String html = htmlParseData.getHtml()
            List<WebURL> links = htmlParseData.getOutgoingUrls()

            println("Text length: " + text.length())
            println("Html length: " + html.length())
            println("Number of outgoing links: " + links.size())
        }

        Header[] responseHeaders = page.getFetchResponseHeaders()
        if (responseHeaders != null) {
            println("Response headers:")
            for (Header header : responseHeaders) {
                println("\t ${header.getName()} : ${header.getValue()}")
            }
        }
        println("=============")
    }
}
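You don't strictly need a form to try this out. Assuming the default UrlMappings ("/$controller/$action?/$id?") is still in place, hitting the action directly in the browser should start the crawl (appName is a placeholder for your application name):
http://localhost:8080/appName/basicCrawl/index
Note that the println output from visit() goes to the console where run-app is running, not to the browser; the browser will only show the "done crawling" text rendered at the end of the action.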