CasperJS 循环没有按预期逐步执行,而是一口气直接跑完

时间:2014-08-21 08:28:10

标签: javascript screen-scraping casperjs

我有以下代码:

        /**
         * Collect the href attribute of every `a.row-link` anchor in the document.
         *
         * Written in plain ES5 because it is handed to `casper.evaluate` and so
         * runs inside the page context.
         *
         * @returns {Array} One entry per matching anchor, in document order;
         *     each entry is whatever getAttribute('href') returns for that anchor.
         */
        function getLinks() {
            var anchors = document.querySelectorAll('a.row-link');
            var hrefs = [];
            var idx;
            for (idx = 0; idx < anchors.length; idx += 1) {
                hrefs.push(anchors[idx].getAttribute('href'));
            }
            return hrefs;
        }


    // Open the initial search-results page. `pagee` is appended as the `p`
    // query parameter; it is defined outside this snippet.
    casper.start('http://somedomain.com/board/search/search/?p=' + pagee);

   // Get all the pages here!
   // Step 1: run `nextpagereg` repeatedly over the current page's HTML and
   // append every match to `pages`, then report how many were found.
   // `nextpagereg` and `pages` are defined outside this snippet.
   casper.then(function(){
        // exec() only advances from match to match when the regex carries the
        // global (g) flag -- presumably nextpagereg does; verify at its definition.
        // The page HTML is fetched again on every iteration.
        // NOTE(review): `pagenos` is assigned without a declaration here; confirm
        // it is declared elsewhere, otherwise it becomes an implicit global.
        while(pagenos = nextpagereg.exec(this.getHTML())) {;
              this.echo(pagenos);
              // Each entry pushed is the match result returned by exec().
              pages.push(pagenos);


        }
        this.echo(pages.length + ' PAGES FOUND');
        return pages;
    });


    // Step 2: for every entry in `pages`, open the corresponding results page,
    // then open each job link and echo its title, company, location and email.
    // `pages`, `pagee`, `regex` and the result variables (links, title, company,
    // loc, email) are not declared in this block; they come from outside this
    // snippet or become implicit globals.
    casper.then(function() {
         // Restart the page counter; it is incremented once per entry of `pages`.
         pagee = 0;

        // Links are collected once, from the page that is loaded when this
        // step runs, before any of the thenOpen() calls below have executed.
        links = this.evaluate(getLinks);
        this.echo(links.length + ' links found:');


        // each() invokes its callback for every page right away, while
        // thenOpen() only queues a navigation step to run later. All
        // 'WORKING ON PAGE' lines are therefore echoed before any page is
        // actually opened (see the CasperJS docs for each/thenOpen).
        this.each(pages, function(self,page){
          pagee++;
          this.echo('WORKING ON PAGE' + pagee);
          // The URL string is built now, with the current value of pagee.
          self.thenOpen('http://somedomain.com/board/search/search/?p=' + pagee, function(self){
           // NOTE(review): `links` still holds the list gathered above, so the
           // same links are visited for every results page -- confirm intent.
           this.each(links, function(self, link){

            // Queue a visit to the job page and print the extracted fields.
            self.thenOpen(link, function(self, a){

              title = this.getHTML('h2#job-title').trim();
              if(casper.exists('p#job-subtitle a')){
                company = this.getHTML('p#job-subtitle a');
              } else {
                // NOTE(review): this overwrites `title` and leaves `company`
                // at its previous value; presumably `company` was meant -- TODO confirm.
                title = "NA"; 
              }

              loc = this.getHTML('p#job-subtitle>strong');
              // `email` is the raw result of exec() on the page HTML (null when nothing matches).
              email = regex.exec(this.getHTML());
              this.echo("Title : " + title);
              this.echo("Company : " + company);
              this.echo("Location : " + loc);
              this.echo("Email : " + email);


              this.echo("************************************************************************************************************");


            });
            //this.echo(link);

         });

        });
        });

    });

    // Start executing the queued steps; the callback passed to run() calls
    // exit() to terminate the script.
    casper.run(function() {
        this.exit();
    });

问题似乎是它没有按顺序逐页处理:外层循环一口气跑完了所有页码,而不是处理完一页再进入下一页!

修改

为了说明问题,下面是输出。可以看到,循环在处理任何一页之前就已经把页码全部递增完了:

    http://somedomain.com/job-board/search/@/?p=2
    http://somedomain.com/job-board/search/search/?p=3
    http://somedomain.com/job-board/search/search/?p=4
    http://somedomain.com/job-board/search/search/?p=5
    http://somedomain.com/job-board/search/search/?p=6
    5 PAGES FOUND
    50 links found in page 1:
 LOOP START
    WORKING ON PAGE1
    WORKING ON PAGE2
    WORKING ON PAGE3
    WORKING ON PAGE4
    WORKING ON PAGE5

1 个答案:

答案 0 :(得分:1)

看起来您需要抓取五个页面中每一页上的链接,但获取链接的代码(getLinks)只在开始迭代之前执行了一次。

你应该添加

links = this.evaluate(getLinks);
this.echo(links.length + ' links found:');

把这两行放到 self.thenOpen('http://somedomain.com/board/search/search/?p=' + pagee, ...) 回调里的 each 调用之前。

更简洁的方法是把 getLinks 的调用移到 self.thenOpen 内部,并把最外层的 this.each(下面第一段)替换为普通的 for 循环(下面第二段):

this.each(pages, function(self,page){
    ...
});

for(var page = 0; page < pages.length; page++){
    ...
}