我正在尝试编写一个 PhantomJS 脚本，用来打印出给定网页上所有以 'http://librivox.org/' 开头的链接：
这是我的脚本：
// PhantomJS webpage module and the single page object the whole script drives.
var webPage = require('webpage');
var page = webPage.create();

// Target URL; keeps the 'unknown' sentinel until a command-line argument replaces it.
var the_url = 'unknown';

// State shared with the step runner.
var steps = [];             // ordered list of step functions, filled in further down
var testindex = 0;          // index of the next step to run
var loadInProgress = false; // set to true while a page is still loading
/**
 * Reports an error delivered to page.onError and then asks PhantomJS to exit with status 1.
 *
 * @param msg   error message text; printed after an 'ERROR: ' prefix
 * @param trace optional list of stack frames; each frame's file, line and
 *              (when present) function name are printed on its own line
 */
page.onError = function (msg, trace) {
    var frames = (trace && trace.length) ? trace : [];

    // One formatted line per stack frame.
    var report = frames.map(function (frame) {
        var where = frame.function ? ' (in function "' + frame.function + '")' : '';
        return ' -> ' + frame.file + ': ' + frame.line + where;
    });

    // The TRACE: header is only emitted when there is at least one frame.
    if (frames.length) {
        report.unshift('TRACE:');
    }
    report.unshift('ERROR: ' + msg);

    console.error(report.join('\n'));
    phantom.exit(1);
};
// Switches set on the global phantom object.
phantom.cookiesEnabled = true;
// NOTE(review): javascriptEnabled is set on both `phantom` and `page.settings`;
// presumably only the page-level setting has an effect — confirm against the PhantomJS API.
phantom.javascriptEnabled = true;

// Settings for the page object used by this script.
page.settings.javascriptEnabled = true;
page.settings.loadImages = false; // the script is much faster when images are not loaded
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36';
// Read the target URL from the command line.
// A length of 1 means no URL was supplied, so the URL is expected at index 1.
var system = require('system');
var args = system.args;

if (args.length === 1) {
    // The usage text names the required URL argument.
    console.log('usage: phantomjs --cookies-file=cookys.txt ./get-librivox-links-from-page.js <librivox url>');
} else {
    the_url = args[1];
}

if (the_url === 'unknown') {
    console.log('Please specify librivox url');
    // A missing URL is a usage error, so exit non-zero (same status page.onError uses).
    phantom.exit(1);
} else {
    // Logged only on the success path: phantom.exit() may not stop the script
    // immediately, and "the_url is unknown" after the error message is misleading.
    console.log('the_url is ' + the_url);
}
// Relay every console message received from the page to this script's own console.
page.onConsoleMessage = function relayPageConsole(text) {
    console.log(text);
};
/********** DEFINE THE STEPS THAT PHANTOM SHOULD RUN **********/
steps = [
    // Step 0: point the page at the requested URL.
    // @param url  address to load; the step runner passes the_url here
    function (url) {
        page.evaluate(function (url) {
            document.location.href = url;
        }, url);
    },
    // Step 1: print every link of the loaded document whose href starts with the prefix.
    function () {
        page.evaluate(function () {
            // This function runs inside the page, so it declares everything it needs locally
            // and reports its results through console.log.
            var prefix = 'http://librivox.org/';
            // Links are visited from last to first, so they are printed in reverse document order.
            for (var i = document.links.length; i-- > 0;) {
                var href = document.links[i].href;
                if (href.substring(0, prefix.length) === prefix) {
                    console.log(href);
                }
            }
        });
    }
];
/********** END OF THE STEPS THAT PHANTOM SHOULD RUN **********/
// Execute the steps one by one, polling every 50 ms.
// Declared with var so the timer handle is not an implicit global.
var interval = setInterval(executeRequestsStepByStep, 50);

/**
 * One polling tick of the step runner.
 *
 * Runs the next entry of `steps` when no page load is in progress, then advances
 * `testindex`. Once `testindex` no longer points at a function, polling stops and
 * PhantomJS is asked to exit after a 5 second grace period.
 */
function executeRequestsStepByStep() {
    if (loadInProgress === false && typeof steps[testindex] === 'function') {
        if (testindex === 0) {
            // Only the first step receives the target URL.
            steps[testindex](the_url);
        } else {
            steps[testindex]();
        }
        testindex++;
    }

    if (typeof steps[testindex] !== 'function') {
        // No step left: stop polling, then wait before exiting so the work started
        // by the last step has time to complete.
        clearInterval(interval);
        interval = 0;
        // Single 5000 ms timer (3000 ms + 2000 ms combined); exit is called as a
        // method of `phantom` rather than passed as a detached function reference.
        setTimeout(function () {
            phantom.exit();
        }, 5000);
    }
}
/**
 * Load-state listeners used by the step runner.
 * They maintain the loadInProgress flag, which tells the runner whether a page load
 * is still in progress. Without it, a step could read the content of a page that
 * is not fully loaded yet.
 */
page.onLoadStarted = function () { loadInProgress = true; };
page.onLoadFinished = function () { loadInProgress = false; };
// page.onConsoleMessage is already assigned earlier in this script with an identical
// handler, so it is not assigned a second time here.
为方便起见,我从一个小的shell脚本调用上面的脚本,如下所示:
$ cat run-get-librivox-links-from-page.sh
#!/bin/sh
# Runs the PhantomJS link-extraction script against a single URL.
# Usage: run-get-librivox-links-from-page.sh <librivox url>
script=/home/red/phantomjs/get-librivox-links-from-page.js
url=$1
# "$url" is quoted so that an empty value, or one containing spaces, is tested as one word.
if [ -z "$url" ]
then
    echo "usage $0 <librivox url>"
    exit 1
fi
# Quoting keeps the shell from word-splitting the arguments or glob-expanding
# characters such as ? and * that commonly appear in URLs.
/usr/bin/phantomjs --debug=false --cookies-file=cookys.txt \
    "$script" "$url"
当我像这样运行脚本时:
$ ./run-get-librivox-links-from-page.sh "https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results"
我的输出看起来像是 search_page 1 的链接，而不是 search_page 4 的链接：
the_url is https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results
The page at https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results displayed insecure content from http://archive.org/download/anythingycdo_mn_1302_librivox/anything_you_can_do_1302_thumb.jpg.
- above message repeated many times. remove for brevity. -
http://librivox.org/first-lensman-by-e-e-smith/
http://librivox.org/the-drums-of-jeopardy-by-harold-macgrath/
http://librivox.org/the-defiant-agents-by-andre-norton-2/
http://librivox.org/the-death-ship-by-william-clark-russell/
http://librivox.org/creatures-of-the-abyss-by-murray-leinster/
http://librivox.org/the-creature-from-beyond-infinity/
http://librivox.org/the-count-of-monte-cristo-by-alexandre-dumas/
http://librivox.org/the-cosmic-computer-by-h-beam-piper/
http://librivox.org/a-columbus-of-space-by-garrett-p-serviss/
http://librivox.org/the-colors-of-space-by-marion-zimmer-bradley-2/
http://librivox.org/the-colors-of-space-by-marion-zimmer-bradley/
http://librivox.org/the-city-at-worlds-end-by-edmond-hamilton/
http://librivox.org/citadel-of-fear-by-gertrude-barrows-bennett/
http://librivox.org/the-chessmen-of-mars-version-3-by-edgar-rice-burroughs/
http://librivox.org/the-bright-messenger-by-algernon-blackwood/
http://librivox.org/bat-wing-by-sax-rohmer/
http://librivox.org/at-the-earths-core-version-2-by-edgar-rice-burroughs/
http://librivox.org/astounding-stories-20-various/
http://librivox.org/astounding-stories-15-march-1931-by-ray-cummings/
http://librivox.org/astounding-stories-04-april-1930-by-ray-cummings/
http://librivox.org/astounding-stories-02-february-1930-by-various/
http://librivox.org/astounding-stories-01-january-1930-by/
http://librivox.org/anything-you-can-do-by-randall-garrett/