我有以下代码:
/**
 * Collect the `href` attribute of every `a.row-link` anchor in the current
 * document.
 *
 * @returns {Array} One entry per matched anchor, in document order, holding
 *     whatever `getAttribute('href')` returned for it.
 */
function getLinks() {
  var anchors = document.querySelectorAll('a.row-link');
  var hrefs = [];
  // querySelectorAll yields an array-like list, so walk it by index.
  for (var i = 0; i < anchors.length; i++) {
    hrefs.push(anchors[i].getAttribute('href'));
  }
  return hrefs;
}
// Open the search-results URL for page number `pagee`.
// NOTE(review): `pagee` is not declared anywhere in this snippet — presumably
// a global set earlier; confirm its initial value.
casper.start('http://somedomain.com/board/search/search/?p=' + pagee);
// Get all the pages here!
// Step: run `nextpagereg` against the current page HTML until exec() returns
// a falsy value, echoing every match and appending it to `pages`, then report
// how many entries `pages` holds.
// NOTE(review): `nextpagereg`, `pages` and `pagenos` are defined outside this
// snippet. If `nextpagereg` is a RegExp without the `g` flag, exec() would
// return the same first match forever and this loop would never end — verify.
casper.then(function(){
// NOTE(review): the `;` directly after `{` is an empty statement with no effect.
while(pagenos = nextpagereg.exec(this.getHTML())) {;
this.echo(pagenos);
pages.push(pagenos);
}
this.echo(pages.length + ' PAGES FOUND');
// NOTE(review): nothing in this snippet consumes this return value; later
// steps read the shared `pages` variable instead.
return pages;
});
// Scrape step: read the job links from the page that is loaded right now, then
// for every entry in `pages` queue a visit to a results page and, inside that,
// a visit to every link, echoing title / company / location / email for each.
casper.then(function() {
// Reset the shared page counter; it is incremented once per `pages` entry below.
pagee = 0;
// NOTE(review): `links` is evaluated exactly once, here, against the page that
// is loaded at this point. The same array is reused inside every thenOpen
// callback below, so later results pages never contribute their own links.
links = this.evaluate(getLinks);
this.echo(links.length + ' links found:');
this.each(pages, function(self,page){
// NOTE(review): this callback body runs for every `pages` entry in a plain
// synchronous loop, so `pagee` is incremented and 'WORKING ON PAGE' is echoed
// for all pages up front; thenOpen presumably only schedules the navigation
// and its callback for later (confirm against the CasperJS docs). The URL
// string is built at scheduling time, so each queued open gets its own `pagee`.
pagee++;
this.echo('WORKING ON PAGE' + pagee);
self.thenOpen('http://somedomain.com/board/search/search/?p=' + pagee, function(self){
// Queue one visit per link in the shared `links` array.
this.each(links, function(self, link){
self.thenOpen(link, function(self, a){
// `title`, `company`, `loc` and `email` are assigned without a declaration,
// so they are shared across every invocation of this callback.
title = this.getHTML('h2#job-title').trim();
if(casper.exists('p#job-subtitle a')){
company = this.getHTML('p#job-subtitle a');
} else {
// NOTE(review): this branch overwrites `title`, not `company`; `company`
// keeps whatever value it had from a previous link. Likely meant
// `company = "NA"` — confirm intent.
title = "NA";
}
loc = this.getHTML('p#job-subtitle>strong');
// `regex` is defined outside this snippet; `email` is the raw exec() result
// (a match array or null), which is string-concatenated below.
email = regex.exec(this.getHTML());
this.echo("Title : " + title);
this.echo("Company : " + company);
this.echo("Location : " + loc);
this.echo("Email : " + email);
this.echo("************************************************************************************************************");
});
//this.echo(link);
});
});
});
});
// Start the Casper run; the callback handed to run() calls exit() on the
// Casper instance it is bound to.
casper.run(function onRunFinished() {
  this.exit();
});
问题似乎是它没有按顺序逐页处理:外层循环一下子就跑完了所有页面,而没有先处理完每一页上的链接!
编辑
为了说明问题,下面是程序的输出。可以看到,循环在处理任何一页之前,就已经把所有页码都递增完了:
http://somedomain.com/job-board/search/@/?p=2
http://somedomain.com/job-board/search/search/?p=3
http://somedomain.com/job-board/search/search/?p=4
http://somedomain.com/job-board/search/search/?p=5
http://somedomain.com/job-board/search/search/?p=6
5 PAGES FOUND
50 links found in page 1:
LOOP START
WORKING ON PAGE1
WORKING ON PAGE2
WORKING ON PAGE3
WORKING ON PAGE4
WORKING ON PAGE5
答案 0 :(得分:1)
看起来您需要抓取五个页面中每一页上的链接,但抓取链接这一步只在开始迭代之前执行了一次。
你应该把下面两行
links = this.evaluate(getLinks);
this.echo(links.length + ' links found:');
添加到 self.thenOpen('http://somedomain.com/board/search/search/?p=' + pagee, ...
的回调函数内部、其中的 each 之前。
更简洁的做法是把 getLinks 的调用
移到 self.thenOpen 的回调中,
并把最外层的
this.each(pages, function(self,page){
...
});
替换为
for(var page = 0; page < pages.length; page++){
...
}