Nightmare、PhantomJS 以及提取由 JavaScript 生成的动态数据

时间:2015-08-26 06:08:39

标签: javascript web-scraping phantomjs

我正在尝试使用 PhantomJS 从 URL https://app.kabuto.com/auth 抓取页面内容(我的最终目标是在 Python 中使用 Selenium 的 PhantomJS WebDriver)。下面的 PhantomJS 脚本运行正常,没有报错,但仍然无法获取页面文本('电子邮件'、'密码'等)。这些是表单元素和标题,我可以在页面的 'Inspect Element' 中看到,而 'View Page Source' 只包含 JavaScript。我可能犯了一些错误,因为我是 PhantomJS 新手,也缺乏 JavaScript 知识,非常感谢任何帮助!

try {
    // Script setup: create the PhantomJS page object, remember the target
    // URL, and replace the default user agent (after logging what it was).
    var page = require('webpage').create();
    var system = require('system');
    var url = 'https://app.kabuto.com/auth';

    // Log the stock user agent before it is overwritten below.
    console.log('The default user agent is ' + page.settings.userAgent);
    page.settings.userAgent = "SpecialAgent";
    /**
     * Debug helper: writes every argument it receives to the console, one
     * per line as JSON prefixed with its position, followed by an empty line.
     * Accepts any number of arguments and returns nothing.
     */
    function printArgs() {
        var received = Array.prototype.slice.call(arguments);
        received.forEach(function (value, position) {
            console.log("    arguments[" + position + "] = " + JSON.stringify(value));
        });
        // Empty line separates this dump from the next log entry.
        console.log("");
    }
    // ---- Page event handlers -------------------------------------------
    // Each handler logs the event so the load sequence can be followed on
    // the console. Lifecycle handlers dump their arguments via printArgs.

    // Fired when the page object is initialized, before any URL is loaded.
    page.onInitialized = function () {
        console.log("page.onInitialized");
        printArgs.apply(this, arguments);
    };

    // Fired when a resource fails to load. The last failure is stored on
    // the page object AND logged immediately; previously it was only stored
    // and never printed, so network/SSL failures stayed invisible.
    page.onResourceError = function (resourceError) {
        page.reason = resourceError.errorString;
        page.reason_url = resourceError.url;
        console.log('Resource error: ' + resourceError.errorString +
            ' (' + resourceError.url + ')');
    };

    // Fired when the page starts loading.
    page.onLoadStarted = function () {
        console.log("page.onLoadStarted");
        printArgs.apply(this, arguments);
    };

    // Fired when the page finishes loading; printArgs shows the arguments
    // PhantomJS passed to the callback.
    page.onLoadFinished = function () {
        console.log("page.onLoadFinished");
        printArgs.apply(this, arguments);
    };

    // Logs every outgoing resource request as pretty-printed JSON.
    page.onResourceRequested = function (request) {
        console.log('Request ' + JSON.stringify(request, undefined, 4));
    };

    // Logs every received resource response as pretty-printed JSON.
    page.onResourceReceived = function (response) {
        console.log('Receive ' + JSON.stringify(response, undefined, 4));
    };

    // Fired for JavaScript errors raised inside the page. The second
    // argument is the stack trace; include it so the failing script and
    // line can be located instead of logging the bare message only.
    page.onError = function (msg, trace) {
        var lines = ['**some js error**' + msg];
        (trace || []).forEach(function (frame) {
            lines.push('    -> ' + frame.file + ': ' + frame.line +
                (frame.function ? ' (in function "' + frame.function + '")' : ''));
        });
        console.log(lines.join('\n'));
    };

    // Relays console output produced inside the page (including the
    // console.log calls made from page.evaluate) to the PhantomJS console.
    page.onConsoleMessage = function (msg, lineNum, sourceId) {
        console.log('CONSOLE: ' + msg);
    };
    // Open the target URL and dump the rendered DOM.
    //
    // Changes from the original single 300 ms timeout:
    //  * On load failure the script now reports the last recorded resource
    //    error (if any) and exits with a non-zero code instead of hanging
    //    forever because phantom.exit() was never reached.
    //  * On success the page is polled until client-side rendering has
    //    produced content, bounded by MAX_WAIT_MS, because a JavaScript-built
    //    page is often still empty 300 ms after the load event.
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log( 'Unable to access network' );
            if (page.reason) {
                console.log('Reason: ' + page.reason + ' (' + page.reason_url + ')');
            }
            phantom.exit(1);
            return;
        }

        var POLL_INTERVAL_MS = 300;   // same cadence as the original delay
        var MAX_WAIT_MS = 10000;      // give up waiting and dump anyway
        var startedAt = Date.now();

        // Prints the rendered text and markup from inside the page context
        // (relayed through page.onConsoleMessage), then terminates PhantomJS.
        function dumpAndExit() {
            page.evaluate(function () {
                console.log('innerText is:' + window.document.body.innerText);
                console.log('outerHTML is: ' + document.documentElement.outerHTML);
            });
            phantom.exit();
        }

        // Re-checks the page until it looks rendered or the deadline passes.
        function poll() {
            // NOTE(review): readiness heuristic - assumes the rendered login
            // page contains at least one <input>; adjust the selector if the
            // target page signals readiness differently.
            var rendered = page.evaluate(function () {
                return !!document.querySelector('input');
            });
            if (rendered || Date.now() - startedAt >= MAX_WAIT_MS) {
                dumpAndExit();
                return;
            }
            window.setTimeout(poll, POLL_INTERVAL_MS);
        }

        window.setTimeout(poll, POLL_INTERVAL_MS);
    });
}

catch(e) {
    console.log("Error: " + e.description);
}

0 个答案:

没有答案