CasperJS 的 evaluate() 在 each() 块内没有被执行

时间:2015-04-20 20:10:40

标签: javascript web-crawler phantomjs casperjs

所以我正在抓取一个页面,收集链接,然后我想抓取这些链接来完成我的数据集。这是一些代码:

crawl.js:

// CasperJS instance shared by every crawl step in this script.
var casper = require("casper").create({
    verbose: true,
    waitTimeout: 3000,
    // jQuery is listed here because the evaluate() callbacks in this script use `$`.
    clientScripts: ["includes/jquery.min.js"],
    pageSettings: {
        userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:23.0) Gecko/20130404 Firefox/23.0"
    }
});

// XPath selector helper exported by the casper module.
var x = require('casper').selectXPath;
// Local module whose `followers` property accumulates the scraped links (see followers.js).
var followers = require('./followers');
// Number of the listing page currently being handled; incremented by collectFollowers.
var currentPage = 1;

/**
 * Counts the own enumerable properties of an object.
 *
 * Attached to the global `Object` constructor (not to `Object.prototype`),
 * so it is called as `Object.size(obj)`.
 *
 * @param {Object} obj - Object to measure. `null`/`undefined` yield 0.
 * @returns {number} Number of own enumerable string-keyed properties.
 */
Object.size = function(obj) {
    var size = 0, key;
    for (key in obj) {
        // Borrow hasOwnProperty from Object.prototype: calling obj.hasOwnProperty
        // directly throws for prototype-less objects (Object.create(null)) and
        // misbehaves when the object shadows the method with its own property.
        if (Object.prototype.hasOwnProperty.call(obj, key)) {
            size++;
        }
    }
    return size;
};

/**
 * Step callback that handles one listing page: takes a screenshot, stores the
 * page's follower links in `followers.followers`, then clicks "Next" and
 * re-schedules itself once the URL has changed.
 *
 * Must be invoked with `this` bound to the casper instance (it is passed as a
 * callback to waitForSelector/waitFor, and calls this.echo, this.capture, etc.).
 *
 * @returns {*} Whatever `terminate` returns on the stop path; otherwise undefined.
 */
var collectFollowers = function() {
    // Declared locally; assigning to an undeclared `key` in the loop below
    // would create an implicit global (and throw in strict mode).
    var url, links, key;

    this.echo("capturing page " + currentPage);
    this.capture("wowhead-p" + currentPage + ".png");

    // don't go too far down the rabbit hole
    // NOTE(review): this returns before the links of the current page are read,
    // so the page on which the crawl stops contributes no links — confirm that
    // this is intended.
    if (currentPage >= 5 || !this.exists(x('//*[text()="Next ›"]'))) {
        processFollowers.call(casper);
        return terminate.call(casper);
    }

    currentPage++;
    this.echo("requesting next page: " + currentPage);
    // Remember where we are so the waitFor check below can detect navigation.
    url = this.getCurrentUrl();

    // Build a { linkText: href } map from the '.listview-cleartext' elements
    // and return it from the evaluate() callback.
    links = this.evaluate(function() {
        var obj = {};
        $('.listview-cleartext').each(function() {
            obj[$(this).text()] = $(this).attr('href');
        });
        return obj;
    });

    // Merge this page's links into the shared map; same-named entries are overwritten.
    for (key in links) {
        followers.followers[key] = links[key];
    }

    this.echo("Page links: " + Object.size(followers.followers));
    this.thenClick(x('//*[text()="Next ›"]')).then(function() {
        // Continue with the next page once the URL changes; if the wait times
        // out, go straight to processing what has been collected so far.
        this.waitFor(function() {
            return url !== this.getCurrentUrl();
        }, collectFollowers, processFollowers);
    });
};

/**
 * Schedules a visit to every collected follower link and tries to read the
 * follower's quest link from each page.
 *
 * Must be invoked with `this` bound to the casper instance.
 *
 * NOTE(review): this is the code the question is about and is left unchanged.
 * According to the answer that follows, the evaluate() callback runs in the
 * sandboxed page context, where `this` is `window` (so `this.echo` is not the
 * casper method) and outer variables such as `followers` and `key` are not
 * accessible.
 */
var processFollowers = function() {
    this.echo("Total followers:" + Object.size(followers.followers));
    // One thenOpen()/then() pair is queued per follower name.
    this.each(Object.keys(followers.followers), function(casper, key) {
        this.thenOpen('http://wowhead.com' + followers.followers[key]).then(function() {
            this.echo("On http://wowhead.com" + followers.followers[key]);
            // NOTE(review): the return value of evaluate() is discarded here, so
            // nothing computed inside the callback reaches the outer script.
            this.evaluate(function() {
                this.echo("Inside the evaluate statement.");
                if ($('a[href=#quests]').length) {
                    this.echo("Has quest!");
                    $('a[href=#quests]').click();
                    var questURL = $('#tab-quests').show().find('.listview-cleartext').attr('href');
                    var questName = $('#tab-quests').show().find('.listview-cleartext').text();
                    this.echo("Quest URL: " + questURL);
                    // Intended to replace the stored href with a richer record.
                    followers.followers[key] = {"name": key, "quest": {"url": questURL, "name": questName}};
                } else {
                    this.echo("Does not have quest!");
                }    
            });
        });
    });
}

/**
 * Prints the final "Done." message and exits.
 *
 * Must be invoked with `this` bound to the casper instance; relies on
 * echo() returning an object that exposes exit().
 */
var terminate = function() {
    var instance = this.echo("Done.");
    instance.exit();
};

// Script entry: open the first listing page, wait for its "Next" link, then
// hand over to collectFollowers (or to processFollowers if the wait times out).
var startUrl = "http://wowhead.com/followers=2";
casper.start(startUrl);

var nextPageLink = x('//*[text()="Next ›"]');
casper.waitForSelector(nextPageLink, collectFollowers, processFollowers);

casper.run();

followers.js:

// followers.js: shared module holding the follower map that crawl.js fills in.

// Rebind `require` through patchRequire before loading anything else.
// patchRequire is not defined in this snippet; presumably it is supplied by the
// CasperJS runtime — verify.
var require = patchRequire(require);
// NOTE(review): `utils` is not used anywhere in the visible part of this module.
var utils = require('utils');
// Single shared object; crawl.js reads and writes it as `followers.followers`.
var followers = {};
exports.followers = followers;

followers 模块用于保存一个全局变量,也就是我在抓取页面时不断构建和更新的对象。我浏览了3页数据,成功收集到链接,然后开始处理它们。就目前而言,CasperJS似乎成功打开了每个页面,但是从不调用传给evaluate()的函数。

我能够在PhantomJS中使用某些异步逻辑来使用此功能,但是切换到了casper,因为它看起来好像会在引擎盖下进行处理。我尝试过thenOpen(),then()和open(),thenOpen()的各种组合,没有then()等等。我搞砸了什么?

1 个答案:

答案 0 :(得分:0)

casper.evaluate()运行在沙盒化的页面上下文中,与PhantomJS版本(page.evaluate())的方式相同。它无法访问外部定义的变量。

evaluate()内部的this引用的是window而非casper,而且我怀疑并不存在window.echo()这样的函数。如果要从页面上下文接收控制台消息,则需要注册remote.message事件:

// Relay console output coming from the page context to casper's own output,
// prefixed so it can be told apart from local echo() calls.
casper.on("remote.message", function(message) {
    var line = "remote: " + message;
    this.echo(line);
});

您必须显式地将结果从页面上下文中传递出来,然后在外部(casper上下文中)把它添加进去:

// Inspect the page inside evaluate() and hand back plain data only: the
// callback runs in the page context, so it logs via console.log (surfaced
// through the "remote.message" event) and returns the quest info instead of
// touching `followers` or `key` directly.
var result = this.evaluate(function() {
    console.log("Inside the evaluate statement.");
    if ($('a[href=#quests]').length) {
        console.log("Has quest!");
        $('a[href=#quests]').click();
        // Look the quest link up once and read both attributes from it.
        var questLink = $('#tab-quests').show().find('.listview-cleartext');
        var questURL = questLink.attr('href');
        var questName = questLink.text();
        console.log("Quest URL: " + questURL);
        return {"url": questURL, "name": questName};
    } else {
        console.log("Does not have quest!");
        return null;
    }
});
// Back in the casper context: `key` and `followers` are accessible here.
if (result) {
    followers.followers[key] = {name: key, quest: result};
}