所以我正在抓取一个页面,收集链接,然后我想抓取这些链接来完成我的数据集。这是一些代码:
crawl.js:
var casper = require("casper").create({
waitTimeout: 3000,
pageSettings: {
userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0"
},
clientScripts: ["includes/jquery.min.js"],
verbose: true
});
var followers = require('./followers');
var currentPage = 1;
var x = require('casper').selectXPath;
Object.size = function(obj) {
var size = 0, key;
for (key in obj) {
if (obj.hasOwnProperty(key)) size++
}
return size;
};
var collectFollowers = function() {
var url;
this.echo("capturing page " + currentPage);
this.capture("wowhead-p" + currentPage + ".png");
// don't go too far down the rabbit hole
if (currentPage >= 5 || !this.exists(x('//*[text()="Next ›"]'))) {
processFollowers.call(casper);
return terminate.call(casper);
}
currentPage++;
this.echo("requesting next page: " + currentPage);
url = this.getCurrentUrl();
var links = this.evaluate(function() {
var obj = {}
$('.listview-cleartext').map(function(){
obj[$(this).text()] = $(this).attr('href');
});
return obj;
});
for (key in links) {
followers.followers[key] = links[key];
}
this.echo("Page links: " + Object.size(followers.followers));
//this.emit('update.followers', links);
this.thenClick(x('//*[text()="Next ›"]')).then(function() {
this.waitFor(function() {
return url !== this.getCurrentUrl();
}, collectFollowers, processFollowers);
});
};
var processFollowers = function() {
this.echo("Total followers:" + Object.size(followers.followers));
this.each(Object.keys(followers.followers), function(casper, key) {
this.thenOpen('http://wowhead.com' + followers.followers[key]).then(function() {
this.echo("On http://wowhead.com" + followers.followers[key]);
this.evaluate(function() {
this.echo("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
this.echo("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
this.echo("Quest URL: " + questURL);
followers.followers[key] = {"name": key, "quest": {"url": questURL, "name": questName}};
} else {
this.echo("Does not have quest!");
}
});
});
});
}
var terminate = function() {
this.echo("Done.").exit();
}
casper.start("http://wowhead.com/followers=2");
casper.waitForSelector(x('//*[text()="Next ›"]'), collectFollowers, processFollowers);
casper.run();
followers.js:
var require = patchRequire(require);
var utils = require('utils');
var followers = {};
exports.followers = followers;
关注者用于存储全局变量,这是我在抓取页面时不断构建和更新的对象。所以我浏览了3页数据,成功收集链接,然后开始处理它们。就目前而言,CasperJS似乎成功打开了每个页面,但是从不调用evaluate函数。
我能够在PhantomJS中使用某些异步逻辑来使用此功能,但是切换到了casper,因为它看起来好像会在引擎盖下进行处理。我尝试过thenOpen(),then()和open(),thenOpen()的各种组合,没有then()等等。我搞砸了什么?
答案 0 :(得分:0)
casper.evalute()
是沙盒页面上下文,与PhantomJS版本(page.evaluate()
)的方式相同。它无法访问外部定义的变量。
this
内{p> evaluate()
引用window
而非casper
,我怀疑有window.echo()
之类的功能。如果要从页面上下文接收控制台消息,则需要注册remote.message
事件:
casper.on("remote.message", function(msg){
this.echo("remote: " + msg);
});
您必须明确地将结果传递出页面上下文并将其添加到那里:
var result = this.evaluate(function() {
console.log("Inside the evaluate statement.");
if ($('a[href=#quests]').length) {
console.log("Has quest!");
$('a[href=#quests]').click();
var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
var questName = $('#tab-quests').show().find('.listview-cleartext').text();
console.log("Quest URL: " + questURL);
return {"url": questURL, "name": questName}};
} else {
console.log("Does not have quest!");
return null;
}
});
if (result) {
followers.followers[key] = {name: key, quest: result};
}