Question

以下代码是用 CasperJS 编写的一个简单的网页抓取脚本（scraper）。

// Create the CasperJS instance that drives the whole script.
var casper = require('casper').create();

// Positional command-line arguments:
//   0 - start URL
//   1 - first page number of the range
//   2 - page number used as the exclusive upper bound of the loop in processPage
//   3 - proxy (read here but not used anywhere in the visible code)
var url = casper.cli.get(0);
var page1 = casper.cli.get(1);
var page2 = casper.cli.get(2);
//console.log(page2);
var proxy = casper.cli.get(3);

//alert(page1);

// Loose pattern for URL-like strings: a run of URL characters, a dot, a 2-4 letter
// suffix and an optional path. Used later to filter the collected hrefs.
var exp = /[-a-zA-Z0-9@:%_\+.~#?&//=]{2,256}\.[a-z]{2,4}\b(\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?/gi;
var regex = new RegExp(exp);

var baseUrl = url;

//console.log(baseUrl);

// CSS selector of the "next page" navigation button.
var nextBtn = "a.navigation-button.next";

// Accumulates the hrefs returned by getPageData across pages.
var allLinks = [];

// Open the start URL.
casper.start(baseUrl);

// Run processPage once the next-page button is present in the page.
casper.waitForSelector(nextBtn, processPage);

// Start executing the queued steps.
casper.run();

/**
 * Step callback: for each page number in [page1, page2), reads the links of
 * the current page via getPageData, appends them to allLinks and schedules a
 * click on the next-page button.
 *
 * NOTE(review): the for loop runs synchronously, while thenClick()/then()
 * only schedule steps for later. Every evaluate() call therefore appears to
 * run against the same, initially loaded page - presumably the reason only
 * the first page's data is collected.
 */
function processPage() {
  for (var i = page1; i < page2; i = i + 1) {
      console.log(i);
    // Runs getPageData inside the page and merges its result into allLinks.
    var pageData = this.evaluate(getPageData);
    allLinks = allLinks.concat(pageData);



  // No next-page button: leave processPage entirely (this also ends the loop).
  if (!this.exists(nextBtn)) {
    return;
  };

  // Schedule the click and, after it, a step that prints the current URL.
  this.thenClick(nextBtn).then(function() {
      //this.echo(i);
    this.echo(this.getCurrentUrl());
    //this.wait(1000);
  });
};
}

/**
 * Collects the href attribute of every element carrying the "pro-title"
 * class in the current document.
 *
 * @returns {Array} One entry per matching element, in document order; an
 *   entry is whatever getAttribute('href') returned for that element.
 */
function getPageData() {
  var titleNodes = document.getElementsByClassName('pro-title');
  var hrefs = [];
  for (var idx = 0; idx < titleNodes.length; idx += 1) {
    hrefs.push(titleNodes[idx].getAttribute('href'));
  }
  return hrefs;
}


/**
 * Final step: opens every collected link that matches `regex` and dumps the
 * profile details scraped from that page (title, services, location,
 * description).
 */
casper.then(function () {
  this.each(allLinks, function (self, link) {
    // Entries come from getAttribute('href') and can be null when an element
    // has no href; calling .match on those would throw, so skip them.
    if (typeof link !== 'string' || !link.match(regex)) {
      return;
    }

    self.thenOpen(link, function () {
      // Declared locally: assigning to an undeclared `jsonObj` would create an
      // implicit global shared by every opened page.
      var jsonObj = {};
      jsonObj.title = this.fetchText('a.profile-full-name');

      // Replace the HTML-escaped ampersand with the word "and".
      jsonObj.services = this.getHTML('div.info-list-text span:nth-child(2) span');
      jsonObj.services = jsonObj.services.replace(/&amp;/g,"and");

      jsonObj.location = this.getHTML('div.pro-info-horizontal-list div.info-list-label:nth-child(3) div.info-list-text span');
      jsonObj.description = this.getHTML('div.profile-about div:nth-child(1)');

      require('utils').dump(jsonObj);
    });
  });
});

我按如下方式执行此脚本：

casperjs scraping.js http://www.houzz.com/professionals/c/Chicago--IL/p/15 1 3

第一个 CLI 参数是起始 URL，第二个和第三个参数分别是要抓取的起始页码和结束页码。

我能够从第一页提取数据，但我不明白为什么我无法从任何后续页面中提取数据。

Answer 1

您不能像这样在 processPage 中混用同步代码和异步代码。循环会立即执行完毕，而点击和下一页的加载是异步发生的。因此，对页面的 evaluate 调用也必须放到异步步骤中执行：

/**
 * Step callback: schedules one scraping step per page number in
 * [page1, page2). Each step collects the links of the page that is loaded at
 * that moment, then clicks the next-page button so the following step sees
 * the next page.
 *
 * Must be invoked with the casper instance as `this`.
 */
function processPage() {
    // Set once a page without a next-page button is reached, so the steps
    // still queued for later page numbers do not re-scrape the same page.
    var reachedLastPage = false;

    // Wrapping the step in a function gives every step its own pageNumber;
    // a `var` loop counter would be shared and already at its final value by
    // the time the deferred steps run.
    var schedulePage = function (pageNumber) {
        this.then(function () {
            if (reachedLastPage) {
                return;
            }

            console.log(pageNumber);
            var pageData = this.evaluate(getPageData);
            allLinks = allLinks.concat(pageData);

            if (!this.exists(nextBtn)) {
                reachedLastPage = true;
                return;
            }

            this.thenClick(nextBtn).then(function () {
                this.echo(this.getCurrentUrl());
            });
        });
    };

    for (var i = page1; i < page2; i = i + 1) {
        schedulePage.call(this, i);
    }
}

CasperJS - 抓取脚本无法导航到下一页

1 个答案: