在casperjs中打开多个链接

时间:2016-01-26 23:47:06

标签: web-scraping phantomjs casperjs

我正试图从这个网站 http://www.basketball-reference.com/teams/GSW/2016_games.html 抓取所有特殊链接(boxscore 链接),然后逐个访问它们,并从每个访问过的页面中抓取一些信息。首先,我想抓取所有链接,逐一访问它们并打印页面标题。问题是它总是打印相同的标题和相同的当前 URL(即初始 URL),即使此时显然应该已经是一个新页面。在我看来,问题出在 `this` 关键字上……(请忽略对链接数量的限制:这段代码取自 casperjs 的 GitHub 示例,我保留这个限制只是为了避免控制台输出过多。)这是我的代码:

// Casper instance used for the whole crawl; verbose mode is switched on.
var casper = require("casper").create({ verbose: true });

// Queue of pages to visit, seeded with the team schedule page.
var links = ["http://www.basketball-reference.com/teams/GSW/2016_games.html"];

// Index into `links` of the next page to open.
var currentLink = 0;

// Upper bound on the number of pages opened, so the crawl always terminates.
// Read from the first positional CLI argument (`~~` coerces it to an integer);
// falls back to 10 when that coercion yields 0.
var upTo = ~~casper.cli.get(0) || 10;

// Queues a step that scrapes the currently loaded page for links (via
// searchLinks) and appends whatever it finds to the global `links` queue.
//
// Must be invoked with the casper instance as `this`, e.g.
// addLinks.call(casper, url). `link` is only used to label the log line.
function addLinks(link) {
    this.then(function() {
        // evaluate() can hand back null when the in-page function fails;
        // treat that as "no links" instead of crashing on `.length`.
        var found = this.evaluate(searchLinks) || [];
        this.echo(found.length + " links found on " + link);
        links = links.concat(found);
    });
}

// Runs inside the page context (it is handed to casper.evaluate): collects
// the anchors found in the 5th column of the #teams_games table and returns
// their destinations as an array of absolute URL strings.
//
// The `href` DOM property is used rather than getAttribute('href'): the raw
// attribute may be a site-relative path (e.g. "/boxscores/..."), which cannot
// be opened on its own, whereas the property is the same link already
// resolved against the document's base URL.
function searchLinks() {
    var anchors = document.querySelectorAll('#teams_games td:nth-child(5) a');
    return Array.prototype.map.call(anchors, function(anchor) {
        return anchor.href;
    });
}

// Opens `link`, pauses for five seconds, then logs the page title and the
// URL the browser ended up on. Must be invoked with the casper instance as
// `this`, e.g. start.call(casper, url).
function start(link) {
    // Logging step executed once the pause is over.
    var reportPage = function() {
        var title = this.getTitle();
        this.echo('Page title: ' + title);
        var currentUrl = this.getCurrentUrl();
        this.echo('Current url: ' + currentUrl);
    };

    this.start(link, function() {
        this.wait(5000, reportPage);
    });
}

// Driver passed to casper.run(): each time it is called it either schedules
// the next queued link (and re-arms itself via run) or, when the queue is
// exhausted or the `upTo` limit is reached, prints a final message and exits.
// Expects the casper instance as `this`.
function check() {
    var nextLink = links[currentLink];

    // Nothing left to visit, or the page limit has been hit: stop here.
    if (!nextLink || !(currentLink < upTo)) {
        this.echo("All done.");
        this.exit();
        return;
    }

    this.echo('--- Link ' + currentLink + ' ---');
    start.call(this, nextLink);
    addLinks.call(this, nextLink);
    currentLink++;
    this.run(check);
}

// Initialise the suite without an initial URL and queue a banner step.
casper.start();
casper.then(function() {
    this.echo("Starting");
});

// Execute the queued steps; `check` is called once they complete and decides
// whether another link should be visited.
casper.run(check);

1 个答案:

答案 0 :(得分:1)

给定一个网址数组,您可以对其进行迭代,依次访问每个网址,如下所示:

// Walk over a known list of URLs, opening each one in turn.
// NOTE: `urls` is not defined in this snippet; it is assumed to be an array
// of URL strings supplied by the surrounding script.
casper.each(urls, function(self, url) {
    // `self` is the first argument each() passes to the callback (presumably
    // the casper instance); `url` is the current entry of `urls`.
    self.thenOpen(url, function(){
        this.echo('Opening: ' + url);
        // Per-page scraping logic goes here.
    });
});

显然,这种方式不会去发现页面上的链接,但用来遍历一组已知的网址是个不错的方法。