从页面上点击的链接中截取数据,然后转到下一页&在CasperJS中重复

时间:2016-03-30 11:46:00

标签: javascript pagination web-scraping phantomjs casperjs

在递归处理页面上的链接之后,我很难让casperjs继续下一页。

我可以让它从每个页面获取数据并浏览页面,或点击页面上的每个链接,但我不能同时执行这两个链接。

var utils = require('utils');
var x = require('casper').selectXPath;

var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  waitTimeout: 10000,
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36     (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  }
});

var currentPage = 1;
var i = 0;
var links = [];
var link_titles = [];


var terminate = function() {
    this.echo("Exiting..").exit();
};

function getSelectedPage() {
    var el = document.querySelector('td.cur');
    return parseInt(el.textContent);
}

function getPageLinks () {
  var links = document.querySelectorAll('h3.r a');
  return [].map.call(links, function(link) {
    return link.getAttribute('href');
  });
}


function getLinkData(link) {
  this.thenOpen(link, function() {


    var title = this.getTitle();


    // Add the data from link
    var data = {
      title: title,
    };
    link_titles.push(data);

  });
}

function loopThroughLinks() {  

 if( i < links.length) {
    this.echo('[LINK #' + i + '] '+ links[i]);
    getLinkData.call(this, links[i]);
    i++;
    this.run(loopThroughLinks);

 }  else {
    utils.dump(link_titles);
    }       
}


function linkData(){
    links = this.evaluate(getPageLinks);
    this.run(loopThroughLinks);
}


var processPage = function() {  

    this.run(linkData);

    //PROBLEM EXISTS BELOW HERE - IF YOU COMMENT OUT FROM HERE IT RUNS AS EXPECTED FOR THE FIRST PAGE
    //WITH CODE BELOW INCLUDED, SKIPS this.run(linkData) AND JUST GOES THROUGH PAGES;
    this.then(function(){

    if (currentPage >= 3) {
        return terminate.call(casper);
    }

    currentPage++;

    this.echo("requesting next page: " + currentPage);
    this.capture("google-results-p" + currentPage + ".png");

    this.thenClick('a.pn span').then(function(){
        this.waitFor(function(){
            return currentPage === this.evaluate(getSelectedPage);

        }, processPage, terminate);
     }); 
  });   //COMMENT OUT TO HERE FOR WORKING ONE PAGE VERSION
}


casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs');

casper.run(processPage);

更新了反映多次运行调用的代码。现在循环浏览第一页,但是从第一页打印所有其他页面的结果??

var utils = require('utils');
var x = require('casper').selectXPath;

var casper = require('casper').create({
  verbose: true,
  logLevel: 'error',
  waitTimeout: 10000,
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36'
  }
});

var currentPage = 1;
var i = 0;
var links = [];
var link_titles = [];


var terminate = function() {
    this.echo("Exiting..").exit();
};

function getSelectedPage() {
    var el = document.querySelector('td.cur');
    return parseInt(el.textContent);
}

function getPageLinks() {
    var links = document.querySelectorAll("h3.r a");
    return Array.prototype.map.call(links, function(e) {
        try {
            // google handles redirects hrefs to some script of theirs
            return (/url\?q=(.*)&sa=U/).exec(e.getAttribute("href"))[1];
        } catch (err) {
            return e.getAttribute("href");
        }
    });
} 

function getLinkData(link) {
  this.thenOpen(link, function() {

    //var title = this.fetchText('title');
    var title = this.getTitle();


    // Add the staff data from link
    var data = {
      title: title,
    };
    link_titles.push(data);

    this.then(function(){  ///ADDED - BACK TO RIGHT PAGE FOR SELECTOR
    this.back();
    });

  });
}

function loopThroughLinks() {  

 if( i < links.length) {
    this.echo('[LINK #' + i + '] '+ links[i]);
    getLinkData.call(this, links[i]);
    i++;
    this.then(loopThroughLinks);

 }  else {
    utils.dump(link_titles);
    }       
}


function linkData(){
    links = this.evaluate(getPageLinks);
    this.then(loopThroughLinks);
}


var processPage = function() {  

    this.wait(2000, function(){
    this.then(linkData);
    });


    this.wait(2000, function(){
    this.then(function(){

    if (currentPage >= 3) {
        return terminate.call(casper);
    }


    this.echo("requesting next page: " + currentPage);
    this.capture("google-results-p" + currentPage + ".png");


    currentPage++;


    this.thenClick('a.pn span').then(function(){
        this.capture('google-results-2-p' + currentPage + '.png');
        this.waitFor(function(){
            return currentPage === this.evaluate(getSelectedPage);

        }, processPage, terminate);
     }); 
  });
 });
}


casper.start('https://www.google.co.uk/?gws_rd=ssl#q=casperjs');

casper.then(processPage);

casper.run();

1 个答案:

答案 0 :(得分:0)

您必须只有一个public class UriAdpter extends TypeAdapter<Uri> { @Override public void write(JsonWriter out, Uri value) throws IOException { out.value(value.toString()); } @Override public Uri read(JsonReader in) throws IOException { if (in.peek() == JsonToken.NULL) { in.nextNull(); return null; } return Uri.parse(in.nextString()); } } (并且只有一个casper.run())来电。 casper.start()启动CasperJS步骤队列,如果没有进一步的步骤,将完成执行。唯一需要保留的呼叫是run(),但所有其他casper.run(processPage);呼叫需要更改为this.run(...)