CasperJS - 内存耗尽

时间:2016-04-07 16:00:24

标签: javascript memory memory-leaks web-scraping casperjs

当我通过命令行运行这个脚本时,它会运行一两个小时,然后命令行输出“Memory Exhausted”(内存耗尽)。我无法弄清楚原因是什么。

此外,也希望得到一些关于如何提高代码可读性和可维护性的一般性建议,因为我将在一个月内把这个项目移交出去。

// File-system module; used below to read and append to the CSV output file.
var fs = require('fs');
// Scratch list of CSV fields for the physician currently being written.
var currentPhysician = [];
// Its length is echoed when the run ends; nothing in the visible code adds to it.
var physicianData = [];
// Highest permit number the crawl will try before stopping.
var permitMax = 99999;
// Permit numbers read back from an existing CSV when a run is resumed.
var alreadyParsed = [];
// Path of the CSV file that results are appended to.
var targetFile = "CMQphysicians.csv";

// Wall-clock start of this run, in milliseconds since the epoch.
var startTime = new Date().getTime();

// Permit number being looked up. It is incremented before every search,
// so starting at -1 makes the first search use permit 0.
var permitNumber = -1;

// First permit number handled by this run (updated when resuming from a CSV).
var firstLicense = 0;

// CasperJS utils module; not referenced by the visible code.
var utils = require('utils');

/**
 * Reports whether this string includes the given substring.
 *
 * @param {string} s - Substring to look for.
 * @returns {boolean} true when `s` occurs anywhere in this string.
 */
String.prototype.contains = function (s) {
    var position = this.indexOf(s);
    return position !== -1;
};


// Options for the Casper instance: verbose logging at "info" level, and
// pages are fetched without images or NPAPI plugins.
var casperOptions = {
    verbose: true,
    logLevel: "info",
    pageSettings: {
        loadImages: false, // skip image downloads
        loadPlugins: false // skip NPAPI plugins (Flash, Silverlight, ...)
    }
};

// The single Casper instance that drives the whole crawl.
var casper = require('casper').create(casperOptions);


/**
 * Formats a permit number as a zero-padded string.
 *
 * Called without arguments it formats the global `permitNumber` to five
 * characters, exactly as before. Values already at least `width` characters
 * long are returned unpadded.
 *
 * @param {number} [number] - Permit number to format; defaults to the global `permitNumber`.
 * @param {number} [width] - Minimum length of the result; defaults to 5.
 * @returns {string} The number left-padded with '0' to at least `width` characters.
 */
function getPermitNumberString(number, width) {
    var digits = String(number === undefined ? permitNumber : number);
    var targetWidth = width === undefined ? 5 : width;
    // Clamp at zero so an over-long number never asks for a negative-length array.
    var missing = Math.max(0, targetWidth - digits.length);

    // Joining N + 1 empty slots with '0' yields exactly N zeros (ES5-safe padding).
    return new Array(missing + 1).join('0') + digits;
}

/**
 * Collects the text of every cell in the physician details table.
 * It reads the global `document`; the script passes it to casper.evaluate().
 *
 * @returns {Array} The `innerText` of each matching <td>, in document order.
 */
function getDetailsData() {
    var cells = document.querySelectorAll('#content-html > table.griddetails > tbody > tr > td');
    var texts = [];
    var idx;

    for (idx = 0; idx < cells.length; idx++) {
        texts.push(cells[idx].innerText);
    }

    return texts;
}

/**
 * Counts the result links found in the physician list table.
 * It reads the global `document`; the script passes it to casper.evaluate().
 *
 * NOTE(review): the selector only matches links in the first cell of the
 * table's second row, so rows further down are not counted - confirm whether
 * multi-result pages are meant to be detected this way.
 *
 * @returns {number} Number of matching anchor elements.
 */
function getPhysicianCount() {
    var resultLinks = document.querySelectorAll("#GViewList > tbody > tr:nth-child(2) > td:nth-child(1) > a");
    return resultLinks.length;
}

/**
 * Reports failed resource requests and stops the run on an unusable error.
 *
 * Failures for URLs containing 'google' are not echoed. When the error string
 * contains 'undefined' the script exits with status 1; the previous empty
 * `while` loop never terminated, so the process simply hung at that point.
 *
 * @param {Object} resourceError - Error details; `url`, `errorCode`, `errorString` and `id` are read.
 */
casper.on("resource.error", function (resourceError) {
    // Coerce to strings so a missing field cannot throw inside the handler.
    var url = String(resourceError.url);
    var errorString = String(resourceError.errorString);

    if (url.indexOf('google') === -1) {
        this.echo("Resource error: " + "Error code: " + resourceError.errorCode + " ErrorString: " + resourceError.errorString + " url: " + resourceError.url + " id: " + resourceError.id, "ERROR");
    }

    if (errorString.indexOf('undefined') !== -1) {
        this.die("Stopping: resource error with an undefined error string for " + url, 1);
    }
});

// Placeholder 'load.started' listener: its only statement is commented out,
// so it currently does nothing.
casper.on('load.started', function () {
    //casper.echo('load started');
});

// Placeholder 'navigation.requested' listener: both statements are commented
// out, so none of the four arguments is used and it currently does nothing.
casper.on('navigation.requested', function (url, navigationType, navigationLocked, isMainFrame) {
    //casper.echo('navigation requested');
    //casper.echo(navigationType);
});

// Echo every 'remote.message' payload to the terminal, prefixed with a fixed label.
casper.on('remote.message', function (message) {
    var labelled = 'from within remote page DOM' + message;
    this.echo(labelled);
});

// Starts Casper on the Google page. The 'load.finished' listener treats that
// page as its initialisation stage, so this callback only announces the start.
casper.start('https://www.google.ca/?gws_rd=ssl', function () { // Loads the initial page.
    casper.echo('Starting!');

});

// URL opened by casper.start(); also the stage assumed for any URL without '.aspx'.
var START_URL = 'https://www.google.ca/?gws_rd=ssl';

// Element each stage waits for before it touches the page.
var READY_SELECTOR = '#___gcse_0 > div > form > table.gsc-search-box > tbody > tr > td.gsc-search-button > input';

/**
 * Reads the existing CSV and returns the highest permit number it contains.
 *
 * The first line (header) is skipped. `alreadyParsed` is cleared first so that
 * running initialisation again cannot grow it with duplicates. The maximum is
 * tracked while reading rather than with Math.max.apply(), which yields
 * -Infinity for an empty list and can exceed the engine's argument limit.
 *
 * @returns {number} Highest permit number found, or -1 when there are no data rows.
 */
function restoreProgress() {
    var stream = fs.open(targetFile, 'r');
    var highest = -1;
    var isHeader = true;
    var line;
    var permit;

    alreadyParsed.length = 0;

    try {
        line = stream.readLine();
        while (line) {
            if (!isHeader) {
                permit = Number(line.substring(0, line.indexOf(',')));
                if (!isNaN(permit)) {
                    alreadyParsed.push(permit);
                    if (permit > highest) {
                        highest = permit;
                    }
                }
            }
            isHeader = false;
            line = stream.readLine();
        }
    } finally {
        // Close the handle even when reading throws.
        stream.close();
    }

    return highest;
}

/**
 * Best-effort attempt to keep the browser's memory use down during long runs.
 *
 * NOTE(review): this assumes the installed CasperJS/PhantomJS provides
 * clearMemoryCache(); when it is missing or throws, the call is skipped.
 * Whether it is enough to avoid "Memory Exhausted" still has to be confirmed.
 */
function releasePageMemory() {
    if (typeof casper.clearMemoryCache !== 'function') {
        return;
    }
    try {
        casper.clearMemoryCache();
    } catch (err) {
        casper.log('clearMemoryCache failed: ' + err, 'debug');
    }
}

/**
 * Initialisation stage: resumes from an existing CSV or creates a new one,
 * then opens the directory search page.
 */
function handleStartPage() {
    casper.echo('on google');

    if (fs.exists(targetFile)) {
        // The index stage increments permitNumber before searching, so store
        // the last permit already written: the next search is then highest + 1.
        // (Storing highest + 1 here made every resumed run skip one permit.)
        permitNumber = restoreProgress();
        firstLicense = permitNumber + 1;
        casper.echo(firstLicense);
    } else {
        fs.write(targetFile, "\uFEFF" + 'Permit Number,Last Name,First Name,Gender,Permit,Status,Specialty,Activity,Authorization,Address,Phone\n', 'a');
    }

    casper.thenOpen('http://www.cmq.org/bottin/index.aspx?lang=en&a=1');
}

/**
 * Index stage: moves on to the next permit number and submits the search form.
 * When the permit range is exhausted nothing further is scheduled.
 */
function handleIndexPage() {
    casper.echo('index stage');
    releasePageMemory();

    permitNumber++;
    if (permitNumber > permitMax) {
        casper.echo('Permit number maxed out');
        return;
    }

    casper.echo('going to list');
    casper.sendKeys('#txbNoPermis', getPermitNumberString());
    casper.echo('sent keys, now clicking');
    casper.thenClick('#btSubmit');
    casper.echo('after the click');
}

/**
 * List stage: no result -> back to the search form, one result -> open its
 * details, anything else -> stop the run with a message naming the permit.
 */
function handleListPage() {
    var count;
    var permit = getPermitNumberString();

    casper.echo('list stage');
    count = casper.evaluate(getPhysicianCount);

    if (count === 0) {
        casper.echo('No physicians for license ' + permit);
        casper.echo('going to index');
        casper.thenClick('#btSubmit');
    } else if (count === 1) {
        casper.echo('Physician exists for license ' + permit);
        casper.echo('going to details');
        casper.thenClick('#GViewList > tbody > tr:nth-child(2) > td:nth-child(1) > a');
    } else if (count > 1) {
        // Exit instead of spinning in an empty loop, and include the permit in
        // the message (it used to be concatenated outside the echo call).
        casper.die('a > 1 at ' + permit, 1);
    } else {
        casper.die('negative a at ' + permit, 1);
    }
}

/**
 * Splits the details header into last and first name.
 *
 * @param {string} headerHtml - Header content, read as "Last, First (...)".
 * @returns {Array} Two trimmed strings [last, first]; a missing part is ''.
 */
function parsePhysicianName(headerHtml) {
    var text = String(headerHtml);
    var parenAt = text.indexOf('(');
    // Without a '(' keep the whole header rather than ending up with ''.
    var fullName = parenAt === -1 ? text : text.substring(0, parenAt);
    var parts = fullName.trim().split(',');

    return [
        (parts[0] || '').trim(),
        (parts[1] || '').trim()
    ];
}

/**
 * Makes one value safe for the CSV by turning commas and newlines into ';'.
 *
 * @param {*} value - Raw cell value; null and undefined become ''.
 * @returns {string} The sanitised field.
 */
function toCsvField(value) {
    var text = (value === null || value === undefined) ? '' : String(value);
    return text.replace(/,/g, ';').replace(/\n/g, ';');
}

/**
 * Builds the CSV line for one physician.
 *
 * Column order: cell 4, last name, first name, then every even-indexed cell
 * from index 2 upwards except index 4 (already used as the first column).
 *
 * @param {Array} name - [last, first] as returned by parsePhysicianName().
 * @param {Array} cells - Cell texts as returned by getDetailsData().
 * @returns {string} The CSV line, terminated by '\n'.
 */
function buildCsvRow(name, cells) {
    var fields = [cells[4], name[0], name[1]];
    var idx;

    for (idx = 2; idx < cells.length; idx += 2) {
        if (idx !== 4) {
            fields.push(cells[idx]);
        }
    }

    for (idx = 0; idx < fields.length; idx++) {
        fields[idx] = toCsvField(fields[idx]);
    }

    return fields.join(',') + '\n';
}

/**
 * Details stage: appends the physician to the CSV and returns to a new search.
 * A missing or too-short details table is reported and skipped instead of throwing.
 */
function handleDetailsPage() {
    var headerSelector = '#content-html > table.griddetails > tbody > tr:nth-child(1) > th';
    var name;
    var cells;

    casper.echo('details stage');
    name = parsePhysicianName(casper.getHTML(headerSelector));
    cells = casper.evaluate(getDetailsData);

    if (cells && cells.length > 4) {
        casper.echo('writing to file!');
        fs.write(targetFile, buildCsvRow(name, cells), 'a');
    } else {
        casper.echo('Details table missing or incomplete for license ' + getPermitNumberString() + '; nothing written', 'ERROR');
    }

    casper.echo(casper.exists('#btNewsearch'));
    casper.echo('going to index');
    casper.thenClick('#btNewsearch');
}

/**
 * Drives the crawl: every finished page load is mapped to a stage by the part
 * of its URL before '.aspx', and that stage's handler is run.
 *
 * NOTE(review): each stage schedules further Casper steps from inside this
 * event handler; whether the accumulated steps add to the memory growth over
 * a long run has not been verified.
 *
 * @param {string} status - Load status passed by Casper (not used).
 */
casper.on('load.finished', function (status) {
    var currentUrl = this.getCurrentUrl();
    var aspxAt = currentUrl.indexOf('.aspx');
    var stage;

    if (aspxAt <= 0) {
        // No usable '.aspx' prefix: treat the page as the start page.
        casper.echo('undefined');
        stage = START_URL.toUpperCase();
    } else {
        stage = currentUrl.substring(0, aspxAt).toUpperCase();
    }

    switch (stage) {
    case START_URL.toUpperCase():
        handleStartPage();
        break;

    case 'http://www.cmq.org/bottin/index'.toUpperCase():
        casper.waitForSelector(READY_SELECTOR, handleIndexPage);
        break;

    case 'http://www.cmq.org/bottin/list'.toUpperCase():
        casper.waitForSelector(READY_SELECTOR, handleListPage);
        break;

    case 'http://www.cmq.org/bottin/details'.toUpperCase():
        casper.waitForSelector(READY_SELECTOR, handleDetailsPage);
        break;

    default:
        casper.echo("Wrong URL!");
        casper.back();
        break;
    }
});

// Runs the queued steps; the callback fires once Casper has nothing left to do.
casper.run(function () {
    casper.echo('ending!');

    // NOTE(review): nothing in the visible code adds to physicianData, so this
    // presumably always prints 0 - confirm whether it is still needed.
    casper.echo(physicianData.length);

});

1 个答案:

答案 0 :(得分:0)

这是由 WebKit 的一个已知错误引起的:

https://bugs.webkit.org/show_bug.cgi?id=154452

可以通过关闭图像加载来解决。

编辑:问题似乎仍然存在。我猜测是因为 CasperJS 已经过时了,所以我放弃了它,改用 Python。