为什么我的PhantomJS脚本从第一页而不是第四页输出链接?

时间:2017-07-19 04:31:03

标签: phantomjs

我正在尝试编写一个PhantomJS脚本,用来打印出下面这个网页上所有以 'http://librivox.org/' 开头的链接:

https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results

这是我的脚本:

// PhantomJS page object that every step and listener in this script works on.
var webPage = require('webpage');
var page = webPage.create();

// Step functions to run in order; the real list is assigned further down.
var steps = [];
// Index into `steps` of the next step to run.
var testindex = 0;
// This is set to true when a page is still loading.
var loadInProgress = false;

// Target URL; stays at the 'unknown' sentinel until the command line supplies one.
var the_url = 'unknown';

/**
 * Error handler installed on the page: prints the message and, when a
 * non-empty trace is supplied, one line per trace entry, then terminates
 * PhantomJS with exit status 1.
 *
 * @param {string} msg   Error message.
 * @param {Array}  trace Trace entries; each entry's `file`, `line` and
 *                       optional `function` fields are printed.
 */
page.onError = function(msg, trace) {
  var lines = ['ERROR: ' + msg];
  var hasTrace = trace && trace.length;

  if (hasTrace) {
    lines.push('TRACE:');
    for (var idx = 0; idx < trace.length; idx++) {
      var frame = trace[idx];
      // Only mention the function when the trace entry names one.
      var where = frame.function ? ' (in function "' + frame.function +'")' : '';
      lines.push(' -> ' + frame.file + ': ' + frame.line + where);
    }
  }

  console.error(lines.join('\n'));
  phantom.exit(1);
};

// Switches set on the global phantom object.
phantom.cookiesEnabled = true;
// NOTE(review): javascriptEnabled is normally a page setting; confirm that
// assigning it on `phantom` has any effect.
phantom.javascriptEnabled = true;

// Per-page settings.
// Script is much faster with loadImages set to false.
page.settings.loadImages = false;
page.settings.javascriptEnabled = true;
// Present the page with a desktop Chrome user-agent string.
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36';


// Command-line handling: the first argument after the script path is the
// URL to scrape.
var system = require('system');
var args = system.args;

if (args.length === 1) {
        // Only the script path was supplied. The usage text now names the
        // required URL argument, which the original message left out.
        console.log('usage:  phantomjs --cookies-file=cookys.txt ./get-librivox-links-from-page.js <librivox url>');
} else {
        // args[0] is the script itself, so args[1] is the URL.
        the_url = args[1];
}

// the_url still holds its 'unknown' sentinel when no URL was given.
if ( the_url === 'unknown' ) { console.log('Please specify librivox url'); phantom.exit(); }

console.log( 'the_url is ' + the_url );

// Relay every console message reported by the page to this script's console.
page.onConsoleMessage = function(text) {
    console.log(text);
};
/**********DEFINE STEPS THAT PHANTOM SHOULD DO***********************/
// Each entry is a function; executeRequestsStepByStep runs them in order and
// passes the target URL to the first one only.
steps = [
        // Step 0: navigate the page to `url` by assigning its location from
        // inside the page context.
        function(url){
                page.evaluate(function(url){
                        document.location.href = url;
                },url);
        },
        // Step 1: print the href of every link in the document that starts
        // with the librivox prefix. Output reaches the script through
        // page.onConsoleMessage.
        // NOTE(review): this runs as soon as loadInProgress is false again;
        // links that the page inserts or replaces later via its own script
        // would not be reflected here -- confirm how the target page builds
        // its result list.
        function(){
                page.evaluate(function(){
                        var prefix = 'http://librivox.org/';
                        var links = document.links;
                        // Walk from the last link to the first, which keeps
                        // the output order of the original loop.
                        for (var i = links.length; i-- > 0;) {
                                if ( links[i].href.indexOf(prefix) === 0 ) {
                                        console.log(links[i].href);
                                }
                        }
                });
        }
];

/**********END STEPS THAT PHANTOM SHOULD DO***********************/

//Execute steps one by one
// Poll every 50 ms; each tick runs at most one pending step. Declared with
// `var` so the timer handle is not an implicit global.
var interval = setInterval(executeRequestsStepByStep, 50);

/**
 * Timer callback that drives the `steps` list.
 *
 * When no page load is in progress and steps[testindex] is a function, that
 * step is invoked (the first step receives the_url, later steps receive no
 * arguments) and testindex is advanced. Once testindex no longer points at a
 * function, the polling timer is cleared and PhantomJS is told to exit five
 * seconds later.
 */
function executeRequestsStepByStep(){
    if (loadInProgress === false && typeof steps[testindex] === "function") {
        if ( testindex === 0 ) {
            steps[testindex](the_url);
        } else {
            steps[testindex]();
        }
        testindex++;
    }
    if (typeof steps[testindex] !== "function") {
        // We need to wait after the steps are complete. A single 5 s timer
        // replaces the original 3 s + 2 s nested pair (same total delay), and
        // exit is called as a method of phantom rather than passed unbound.
        clearInterval(interval);
        interval = 0;
        setTimeout(function(){
            phantom.exit();
        }, 5000);
    }
}

/**
 * These listeners are very important for the step runner to work properly.
 * They maintain the loadInProgress marker, which tells the runner whether a
 * page load is currently underway. Without it, a step could read the page
 * before it has finished loading.
 *
 * The console-message handler is installed earlier in the script, so it is
 * not assigned a second time here.
 */
page.onLoadStarted = function() { loadInProgress = true; };
page.onLoadFinished = function(status) {
    // Report a failed load instead of silently continuing with whatever
    // content the page holds.
    if (status !== 'success') {
        console.error('Page load did not succeed (status: ' + status + ')');
    }
    loadInProgress = false;
};

为方便起见,我从一个小的shell脚本调用上面的脚本,如下所示:

$ cat run-get-librivox-links-from-page.sh
#!/bin/sh
# Wrapper: runs the PhantomJS link-extraction script against the URL in $1.

script=/home/red/phantomjs/get-librivox-links-from-page.js
url=$1
# Quoted so an empty value, or one containing spaces or glob characters,
# is tested as a single word.
if [ -z "$url" ]
then
        echo "usage $0 <librivox url>"
        exit 1
fi
# Quoted so the URL (which contains ? and &) reaches phantomjs as exactly
# one argument, without word splitting or pathname expansion.
/usr/bin/phantomjs --debug=false --cookies-file=cookys.txt \
"$script" "$url"

当我像这样运行脚本时:

$ ./run-get-librivox-links-from-page.sh "https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results"

我的输出看起来像是 search_page=1 的链接,而不是 search_page=4 的链接:

the_url is https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results
The page at https://librivox.org/reader/251?primary_key=251&search_category=reader&search_page=4&search_form=get_results displayed insecure content from http://archive.org/download/anythingycdo_mn_1302_librivox/anything_you_can_do_1302_thumb.jpg.

- above message repeated many times; removed for brevity -

http://librivox.org/first-lensman-by-e-e-smith/
http://librivox.org/the-drums-of-jeopardy-by-harold-macgrath/
http://librivox.org/the-defiant-agents-by-andre-norton-2/
http://librivox.org/the-death-ship-by-william-clark-russell/
http://librivox.org/creatures-of-the-abyss-by-murray-leinster/
http://librivox.org/the-creature-from-beyond-infinity/
http://librivox.org/the-count-of-monte-cristo-by-alexandre-dumas/
http://librivox.org/the-cosmic-computer-by-h-beam-piper/
http://librivox.org/a-columbus-of-space-by-garrett-p-serviss/
http://librivox.org/the-colors-of-space-by-marion-zimmer-bradley-2/
http://librivox.org/the-colors-of-space-by-marion-zimmer-bradley/
http://librivox.org/the-city-at-worlds-end-by-edmond-hamilton/
http://librivox.org/citadel-of-fear-by-gertrude-barrows-bennett/
http://librivox.org/the-chessmen-of-mars-version-3-by-edgar-rice-burroughs/
http://librivox.org/the-bright-messenger-by-algernon-blackwood/
http://librivox.org/bat-wing-by-sax-rohmer/
http://librivox.org/at-the-earths-core-version-2-by-edgar-rice-burroughs/
http://librivox.org/astounding-stories-20-various/
http://librivox.org/astounding-stories-15-march-1931-by-ray-cummings/
http://librivox.org/astounding-stories-04-april-1930-by-ray-cummings/
http://librivox.org/astounding-stories-02-february-1930-by-various/
http://librivox.org/astounding-stories-01-january-1930-by/
http://librivox.org/anything-you-can-do-by-randall-garrett/

0 个答案:

没有答案