这个示例展示了如何从 Google 抓取网址。按照示例的说法,得到的网址应该是干净的。但是,当我运行此示例时,我的输出如下所示:
20 links found:
- /url?q=http://casperjs.org/&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CBQQFjAA&usg=AFQjCNH321k0JXrSx5WZp-fH6JwxX-O75Q
- /url?q=http://code4fun.fr/tutoriel-casperjs/&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CBoQFjAB&usg=AFQjCNHreU-9mg7OZxK3TOl94HDPOnA_aQ
- /url?q=http://casperjs.readthedocs.org/&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CCEQFjAC&usg=AFQjCNGzX6V5ZQtmCwHwZerHR3ftK3pHOw
- /url?q=https://github.com/n1k0/casperjs&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CCcQFjAD&usg=AFQjCNEiGMDpYiPm1qXK7ZxDCwWwKjAStg
- /url?q=http://www.technologies-ebusiness.com/enjeux-et-tendances/casperjs-pour-des-tests-d-integration&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CC4QFjAE&usg=AFQjCNFOGl1p6ApqP8TmAxhtQp33DHpbcQ
- /url?q=https://www.lullabot.com/blog/article/testing-front-end-casperjs&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CDQQFjAF&usg=AFQjCNG53ZxHl8yZ0JGdzNbwKuZmPOLqCg
- /url?q=http://blog.newrelic.com/2013/06/04/simpler-ui-testing-with-casperjs-2/&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CDoQFjAG&usg=AFQjCNFzlDb7R4Uv-jj_3S5IbJUpKF-7fA
- /url?q=https://www.npmjs.org/package/grunt-casperjs&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CEEQFjAH&usg=AFQjCNGn-dwJpkX_XTQv8YnFZTClcLosJA
- /url?q=http://www.phase2technology.com/blog/behavorial-test-for-custom-entity-using-casperjs/&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CEcQFjAI&usg=AFQjCNFG0KDAADmocesrDoqQTHW6PPO8KQ
- /url?q=http://blog.codeship.io/2013/03/07/smoke-testing-with-casperjs.html&sa=U&ei=_-TuU-KBC83-yQSu5YKQAg&ved=0CE4QFjAJ&usg=AFQjCNG5AsT2iKCnN-utrCGsthCZCpYKaQ
- /url?q=http://phantomjs.org/&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CBQQFjAA&usg=AFQjCNGXz7tw-UkfDOpqvYV89KlcJPGfHQ
- /url?q=http://phantomjs.org/download.html&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CB8QFjAB&usg=AFQjCNG_czKcYiFKskAvoRl1CceXuTJecA
- /url?q=http://www.mathieurobin.com/2013/04/phantomjs-chargez-et-jouez-avec-vos-sites-en-js-sans-quitter-la-console/&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CCUQFjAC&usg=AFQjCNEAtYz0zcsRVYy-37U9sJL7e9EqYQ
- /url?q=http://svay.com/blog/paris-js-10-introduction-a-phantomjs-un-navigateur-webkit-headless/&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CCsQFjAD&usg=AFQjCNE9dUuVQmNpK064a9GPJyOIetUWAA
- /url?q=https://github.com/ariya/phantomjs&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CDIQFjAE&usg=AFQjCNErqnWYxIVwBwXeUjaSd4SFicQqpw
- /url?q=https://github.com/gruntjs/grunt-lib-phantomjs&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CDkQFjAF&usg=AFQjCNHkRVx926JJkKhdoKxKsKVcQc-QTg
- /url?q=http://blog.octo.com/seo-spa-angular/&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CD8QFjAG&usg=AFQjCNFcj-ykUo-rSQKlcEZIy1qjSlW-oQ
- /url?q=https://www.npmjs.org/package/phantomjs&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CEYQFjAH&usg=AFQjCNGweWRdm8qjqxOOybFgtz5B8CnMDQ
- /url?q=http://code.google.com/p/phantomjs/&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CEwQFjAI&usg=AFQjCNEwvNx7NNMDAaiqHZ_y-3Bbf62W_w
- /url?q=http://casperjs.org/&sa=U&ei=_-TuU9yhG4iZyASb8oK4Dw&ved=0CE4QFjAJ&usg=AFQjCNGNEKkl1eWaFx9Sz6R7ZFVN9r1Bhw
这是示例中给出的、我所运行的代码:
// Shared accumulator for the hrefs collected by the casper steps further down;
// it is reassigned by those steps and printed in the final run() callback.
var links = [];
// Create the CasperJS instance that the start/then/run calls are made on.
var casper = require('casper').create();
/**
 * Collect the raw `href` attribute of every anchor matching `h3.r a`
 * in the current document.
 *
 * Reads the global `document`, so it has to run where one is available.
 *
 * @returns {Array} one entry per matched anchor, in document order, holding
 *     whatever `getAttribute('href')` returned for that anchor.
 */
function getLinks() {
    var anchors = document.querySelectorAll('h3.r a');
    var hrefs = [];
    // The query result is only array-like, so walk it by index.
    for (var i = 0; i < anchors.length; i++) {
        hrefs.push(anchors[i].getAttribute('href'));
    }
    return hrefs;
}
// Step 1: open Google and submit a search for 'casperjs'.
casper.start('http://google.fr/', function() {
// search for 'casperjs' from google form
// (the trailing `true` is fill()'s submit flag — see the CasperJS fill() docs)
this.fill('form[action="/search"]', { q: 'casperjs' }, true);
});
// Step 2: harvest the first result set, then submit the second search.
casper.then(function() {
// aggregate results for the 'casperjs' search
// (this assignment replaces whatever `links` held before)
links = this.evaluate(getLinks);
// now search for 'phantomjs' by filling the form again
this.fill('form[action="/search"]', { q: 'phantomjs' }, true);
});
// Step 3: harvest the second result set and append it to the first.
casper.then(function() {
// aggregate results for the 'phantomjs' search
links = links.concat(this.evaluate(getLinks));
});
// Kick off the queued steps; the callback prints the collected hrefs and exits.
casper.run(function() {
// echo results in some pretty fashion
this.echo(links.length + ' links found:');
// One " - <href>" bullet per line; exit() is chained on echo()'s return value.
this.echo(' - ' + links.join('\n - ')).exit();
});
有人可以解释发生了什么以及为什么我的网址不像他们那样干净吗?
答案 0 :(得分:0)
这个示例本身没有问题,因为你确实得到了网址,只是其中带有一点噪音。看起来 Google 在此期间改变了 href 的格式。所以你可以添加
// Strip Google's redirect wrapper ("/url?q=<target>&sa=...&ei=...") from each
// collected href so that only the target URL remains.
links = links.map(function(link) {
    var prefix = '/url?q=';
    // Leave anything that is not a redirect wrapper untouched. The previous
    // version turned every href lacking "&sa=U&ei=" into an empty string,
    // because indexOf() returned -1 and substring(0, -1) behaves like
    // substring(0, 0).
    if (typeof link !== 'string' || link.indexOf(prefix) !== 0) {
        return link;
    }
    // The q value ends at the next '&', whichever tracking parameter follows,
    // so this no longer depends on the exact "&sa=U&ei=" ordering.
    var end = link.indexOf('&', prefix.length);
    var target = end === -1
        ? link.substring(prefix.length)
        : link.substring(prefix.length, end);
    try {
        // q is a query-string value, so it may be percent-encoded.
        return decodeURIComponent(target);
    } catch (e) {
        // Malformed escape sequence: fall back to the undecoded value rather
        // than aborting the whole run.
        return target;
    }
});
在最后一步中对 links 调用 join 之前。
答案 1 :(得分:0)
我实际上找到了一个变通办法,即使用名为 getLinks() 的函数。它运行良好,应该更符合我的需要。组合使用 split() 和 pop() 就可以得到所需的内容。
代码:
/**
 * For every anchor matching `h3.r a` in the current document, return the
 * part of its `href` attribute that follows the last '/'.
 *
 * Reads the global `document`, so it has to run where one is available.
 *
 * @returns {Array} one string per matched anchor, in document order; an href
 *     ending in '/' yields '' and an href without any '/' is returned whole.
 */
function getLinks() {
    var anchors = document.querySelectorAll('h3.r a');
    var lastSegments = [];
    // The query result is only array-like, so borrow forEach from Array.
    Array.prototype.forEach.call(anchors, function(anchor) {
        var pieces = anchor.getAttribute('href').split('/');
        // split() always yields at least one piece, so this index is valid.
        lastSegments.push(pieces[pieces.length - 1]);
    });
    return lastSegments;
}
// NOTE(review): `googleSearch` is not defined in this snippet — presumably the
// URL of a Google results page; confirm before running.
casper.start(googleSearch, function() {
// Hand getLinks to evaluate() and keep what it returns in `links`.
links = this.evaluate(getLinks);
});
// Kick off the queued step; the callback prints the collected values and exits.
casper.run(function() {
// echo results in some pretty fashion
this.echo(links.length + ' links found:');
// One entry per line; exit() is chained on echo()'s return value.
this.echo(links.join('\n')).exit();
});