如何按顺序进行page.open()请求?

时间:2015-09-09 15:27:53

标签: javascript phantomjs httprequest

我对此进行了编码,但它仍然是非连续的。我希望它能够等到实际请求结束直到调用一个新的函数....但这不起作用。

问题1: page.open()调用不是连续的,如下所示:

6       protocol: https:     type: Content
7       protocol: https:     type: Content
8       protocol: https:     type: Content
9       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/
10       protocol: https:     type: Content
11       protocol: https:     type: Content
12       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/anrichte/
LINE: https://www.roller.de/einrichten/arbeitsstuhl/
LINE: https://www.roller.de/einrichten/arbeitstisch/
LINE: https://www.roller.de/einrichten/armlehnstuehle/
LINE: https://www.roller.de/einrichten/badezimmermoebel
LINE: https://www.roller.de/einrichten/bistrostuehle/
LINE: https://www.roller.de/einrichten/buecherregal/
13       protocol: https:     type: Content
14       protocol: https:     type: Content
15       protocol: https:     type: Content
16       protocol: https:     type: Content
LINE: https://www.roller.de/einrichten/buerocontainer/
LINE: https://www.roller.de/einrichten/bueroregale/
17       protocol: https:     type: Content
18       protocol: https:     type: Content

LINE:每个请求只应打印一次,但在没有page.open结果的情况下会出现几次,导致早期的stream.atEnd()= true。如果它是连续的,那应该是不可能的。

问题2:当我的.txt文件包含100个链接(每行1个)时,最后一行没有被删除,99个被打印,一个不是

问题3:当我给它一个包含1000个网址的列表时崩溃

问题4: 10个链接= 10个打印,100个链接= 98个打印,stream.atEnd()确实出现多次,500个链接= 497-498个打印+ stream.atEnd()问题, 1000个链接=崩溃

console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('100sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;

function nextPage() {
    if (stream.atEnd()) {
        //stream.close();
        console.log("STREAM END: " + stream.atEnd());
        console.log("FILE ENDS HERE");
        //phantom.exit();
    }
    if (!stream.atEnd()) {
        var line = stream.readLine();
        console.log("LINE: " + line);
        getRequest(line);
    }
}

function getRequest(line2) {
    //console.log(line);
    var page = webPage.create();
    page.settings.loadImages = false;
    page.open(line2, function() {});
    //console.log("page.open() " + line2);
    //console.log("opened " + line2);
    page.onResourceRequested = function(requestData, request) {
        //console.log("BEFORE: " + requestData.url);
        var match = requestData.url.match(/example.com\/ca/g)
        //console.log("Match: " + match);
        //console.log(request.url);
        if (match != null) {
            hasFound = true;
            var targetString = decodeURI(JSON.stringify(requestData.url));
            var klammerauf = targetString.indexOf("{");
            var jsonobjekt = targetString.substr(klammerauf,      (targetString.indexOf("}") - klammerauf) + 1);
            targetJSON = (decodeURIComponent(jsonobjekt));
            var t = JSON.parse(targetJSON);
            console.log(i + "       " + t['groups'] + "     " +    t['campID']);
            i++;
            //console.log(targetJSON);
            request.abort;
        }
    };
    page.onLoadFinished = function(status) {
        if (!hasFound) {
            console.log(i + " :NOT FOUND: " + line2);
            i++;
        }
        //request.abort();
        page.close();
        nextPage();
    }
}

nextPage();

0 个答案:

没有答案