在Casper重复函数

时间:2016-01-28 07:20:02

标签: javascript casperjs

我的目标是获取作业网站的每个作业链接,通过以下作业链接转到每个作业详细信息页面,通过CASPERJS下载并保存html中的详细信息。 每当我们回来时,每个工作链接的ID都会改变。在作业链接和作业详细信息页面之间,我需要每次在casper.repeat下获取所有作业ID。但NoOfLink数组在重复函数之外变为空[我在代码中注释该部分]。问题是什么?

var casper = require('casper').create();

var noOfRecordsToLoop = 0;
var TotalNoofNullElement = 0;
var NoOfLink = [];

var x = require('casper').selectXPath;

casper.echo('\nStart loding site......');

//---------------------------------------------Load and Scroll the site---------------------------------------//

casper.start('https://........../...../.......Careers/');

casper.wait(10000, function () {
    //---------Total no of Job posting------//

    var noOfRecords = this.fetchText(x('//*[@id="...........................jobProfile......"]'));
    noOfRecordsToLoop = noOfRecords.replace(/[^0-9]/g, "");
    var totalNoOfPage = Math.ceil(parseInt(noOfRecords) / 50);

    casper.echo('\nStart scrolling site......');

    casper.repeat(totalNoOfPage, function () {
        this.scrollToBottom(); //-----------------------Scroll down
        casper.wait(10000, function () {})
    })

})

//------------------------------------------------Load and Scroll the site---------------------------------------//


casper.then(function () {
    //-----------------------------------------Get all the link elements  --------------------------//

    var countForLink = 0;
    var numTimesForRpt = noOfRecordsToLoop;
    var numTimes = noOfRecordsToLoop;

    casper.repeat(numTimesForRpt, function () {

        RetElement = this.evaluate(function () {
            var startingRow = '//*[contains(@id, "...-uid-")]'
                var element = __utils__.getElementByXPath(startingRow).getAttribute('id');
            return element;
        });

        var count = RetElement.replace(/[^0-9]/g, "");

        casper.repeat(numTimes, function () {
            var MatchElements = this.evaluate(function (count) {
                    var xp = '//*[contains(@id, "...-uid-' + count + '")]'
                        var element = __utils__.getElementByXPath(xp).getAttribute('id');
                    return element;
                }, count++);

            if (!MatchElements) {
                TotalNoofNullElement = TotalNoofNullElement + 1
            } else {
                NoOfLink.push(MatchElements);
            }

            //**Here array elements are accessible**
            for (var k = 0; k < NoOfLink.length; k++) {
                this.echo(NoOfLink[k]);
            }

        });

        //**But here array elements are not accessible outside of repeat** function
        this.echo("Size of array is" + NoOfLink.length);

        for (var q = 0; q < NoOfLink.length; q++) {
            this.echo(NoOfLink[q]);
        }

        //-----------------------------------------Get all the link elements----------------------------//

        //------------------------------------Go to the Job Detail Page Extract HTML and Save---------------------------//

        this.echo("\n Inside repeat to Generate HTML");
        var num = NoOfLink[countForLink];
        this.echo("\nLink id is " + NoOfLink[countForLink]);
        num = parseInt(num.replace(/[^0-9]/g, ""));
        this.echo("\nNum is " + num);

        //-----------------Click to go to the Job Detail Page------------------//

        casper.thenClick(x('//*[@id="..-uid-' + num + '"]/div/div'));

        casper.wait(5000, function getJobDetail() {

            var content = this.getElementInfo(x(".//*[contains(@id,'......t-uid-')]")).html;
            var divStart = '<div id="extrdHtml">'
                var divEnd = '</div>'
                var body = divStart + content + divEnd

                this.echo("\nContent of Job detail :" + body);

            var fs = require('fs');

            fs.write('extractedJob' + NoOfLink[countForLink] + '.html', body, 'w');

            this.echo("\nFile saved");

            //------------------------------------Go to the Job Detail Page Extract HTML and Save---------------------------//

        }); //casper.wait

        casper.back();

        casper.wait(5000);

        countForLink++

    }); //casper.repeat

}); //casper.then

//-------------------------------------------Get all the link elements------------------------------//

casper.run();

有两个重复循环。

  1. casper.repeat(numTimesForRpt, function () { - 这是主要的外循环,第二个循环所在的位置。
  2. casper.repeat(numTimes, function () - 我在哪里获取链接并填充NoOfLink数组。我试图在第二个循环之外(在主外循环内)获取数组元素值,但它不起作用。

1 个答案:

答案 0 :(得分:0)

所有then*wait*函数都是异步步骤函数。如果您调用它们,则需要安排在当前步骤结束时执行的步骤。 casper.repeat()uses casper.then()的函数,因此也是异步的。 casper.repeat()之后的每个同步代码都将在 repeat回调的内容之前执行

您有两种选择:

  1. casper.repeat()casper.then()之后的所有内容换行,以使其异步或
  2. 使用正常的同步循环而不是repeat,如果repeat的回调不需要像您的情况那样异步进行评估
  3. 顺便说一句,通过利用CasperJS提供的辅助函数,您可以经常减少代码。例如,您不需要仅使用evaluate()来获取XPath的某些元素的id属性。您可以使用casper.getElementsAttribute()执行此操作。

    示例:

    var count = RetElement.replace(/[^0-9]/g, "");
    
    for(var i = count; i < (numTimes + count); i++) {
        var MatchElements = this.getElementsAttribute(x('//*[contains(@id, "...-uid-' + i + '")]'), 'id');
    
        if (!MatchElements) {
            TotalNoofNullElement = TotalNoofNullElement + 1
        } else {
            NoOfLink.push(MatchElements);
        }
    
        //**Here array elements are accessible**
        for (var k = 0; k < NoOfLink.length; k++) {
            this.echo(NoOfLink[k]);
        }
    }