我正在构建一个简单的 Web scraper。这是我正在抓取的网站:http://www.home.com/pro/c/oho,-NI。我的抓取工具会点击每个类名为 pro-title
的链接,并从打开的页面中提取数据(例如:http://www.me.com/pro/home/marcelle-services)。
// Scraper as posted in the question: gather the href of every element with
// class "pro-title" on the listing page, open each matching link, and print
// the text found in its "div.info-list-text" elements.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true
});

// Module-level state. Only `links` and `regex` are read below; the other
// names are declared but never used by this script.
var jsonObj = {},
    links,
    name,
    paragraph,
    contact,
    description,
    location;

// Loose URL-shaped pattern used to decide which hrefs are worth opening.
var expression = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(expression);

casper.start('http://www.home.com/ro/c/oho,-TN');

// Step 1: runs inside the page and returns the raw href attribute of each
// "pro-title" element.
casper.then(function getLinks() {
    links = this.evaluate(function () {
        var anchors = document.getElementsByClassName('pro-title');
        var hrefs = [];
        for (var i = 0; i < anchors.length; i++) {
            hrefs.push(anchors[i].getAttribute('href'));
        }
        return hrefs;
    });
});

// Step 2: queue one page visit per href that matches the URL pattern.
casper.then(function () {
    this.each(links, function (self, link) {
        if (!link.match(regex)) {
            return;
        }
        self.thenOpen(link, function () {
            // The selector matches several elements on the profile page, which
            // is why the echoed output contains more than just the location.
            var locationText = this.fetchText('div.info-list-text');
            // Alternative that was tried: document.querySelectorAll("div.info-list-text")[1]
            var contactText = this.fetchText('span.pro-contact-text');
            var aboutText = this.fetchText('div.profile-about div');
            this.echo(locationText);
            // contactText and aboutText are fetched but not printed.
        });
    });
});

casper.run(function () {
    this.exit();
});
上面的代码产生了如下输出:
Professionals
Interior Decorators
Contact: GuilbeauLocation: 5007 Wyoming Ave.Nowoah, MI 45786
我想排除 div.info-list-text a span:first 选中的内容,这样就不会输出单词 Professionals。
答案 0 :(得分:1)
可以考虑引入 jQuery,这样选择元素会更方便。一种解决方案:
// Scraper from the answer: same crawl as the question, but jQuery is injected
// into every page so individual "div.info-list-text" pieces can be selected
// and the unwanted leading "Professionals" span can be skipped.
var casper = require('casper').create({
    logLevel: "verbose",
    debug: true,
    // Injected into each loaded page so `$` is available inside evaluate().
    clientScripts: ['jquery.js']
});

// hrefs collected from the listing page; filled in by the getLinks step.
var links;

// Loose URL-shaped pattern used to decide which hrefs are worth opening.
var expression = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(expression);

casper.start('http://www.houzz.com/professionals/c/Nashville,-TN');

// Step 1: runs inside the page and returns the href attribute of each
// "pro-title" element.
casper.then(function getLinks() {
    links = this.evaluate(function () {
        var links = document.getElementsByClassName('pro-title');
        links = Array.prototype.map.call(links, function (link) {
            return link.getAttribute('href');
        });
        return links;
    });
});

// Step 2: visit every matching link and print the selected text.
casper.then(function () {
    // evaluate() yields null when the page-side function fails, so fall back
    // to an empty list instead of throwing inside each().
    this.each(links || [], function (self, link) {
        // getAttribute('href') is null for elements without an href.
        if (link && link.match(regex)) {
            self.thenOpen(link, function () {
                // Pick the wanted pieces by position: the second "span a span"
                // of the first info block (skipping "Professionals"), followed
                // by the whole second and third info blocks.
                var txtYouWant = this.evaluate(function () {
                    var infoBlocks = $("div.info-list-text");
                    var desiredText = infoBlocks.first().find("span a span").eq(1).text();
                    desiredText += infoBlocks.eq(1).text();
                    desiredText += infoBlocks.eq(2).text();
                    // Fixed: the original returned the undeclared `desiredTxt`.
                    return desiredText;
                });
                this.echo(txtYouWant);
            });
        }
    });
});

// Without run() the queued steps never execute.
casper.run(function () {
    this.exit();
});
编辑:
请务必按如下方式修改这一部分:
// Casper instance options; the relevant addition is `clientScripts`.
var casper = require('casper').create({
logLevel:"verbose",
debug:true,
// Per the CasperJS docs, scripts listed here are injected into each loaded
// page. NOTE(review): presumably jquery.js must sit next to this script —
// verify the path.
clientScripts: ['jquery.js']
});