我对此进行了编码,但它仍然是非连续的。我希望它能够等到实际请求结束直到调用一个新的函数....但这不起作用。
问题1: page.open()调用不是连续的,如下所示:
6 protocol: https: type: Content 7 protocol: https: type: Content 8 protocol: https: type: Content 9 protocol: https: type: Content LINE: https://www.roller.de/einrichten/ 10 protocol: https: type: Content 11 protocol: https: type: Content 12 protocol: https: type: Content LINE: https://www.roller.de/einrichten/anrichte/ LINE: https://www.roller.de/einrichten/arbeitsstuhl/ LINE: https://www.roller.de/einrichten/arbeitstisch/ LINE: https://www.roller.de/einrichten/armlehnstuehle/ LINE: https://www.roller.de/einrichten/badezimmermoebel LINE: https://www.roller.de/einrichten/bistrostuehle/ LINE: https://www.roller.de/einrichten/buecherregal/ 13 protocol: https: type: Content 14 protocol: https: type: Content 15 protocol: https: type: Content 16 protocol: https: type: Content LINE: https://www.roller.de/einrichten/buerocontainer/ LINE: https://www.roller.de/einrichten/bueroregale/ 17 protocol: https: type: Content 18 protocol: https: type: Content
LINE:每个请求只应打印一次,但在没有page.open结果的情况下会出现几次,导致早期的stream.atEnd()= true。如果它是连续的,那应该是不可能的。
问题2:当我的.txt文件包含100个链接(每行1个)时,最后一行没有被删除,99个被打印,一个不是
问题3:当我给它一个包含1000个网址的列表时崩溃
问题4: 10个链接= 10个打印,100个链接= 98个打印,stream.atEnd()确实出现多次,500个链接= 497-498个打印+ stream.atEnd()问题, 1000个链接=崩溃
console.log('Hello, world!');
var fs = require('fs');
var stream = fs.open('100sitemap.txt', 'r');
var webPage = require('webpage');
var i = 1;
function nextPage() {
if (stream.atEnd()) {
//stream.close();
console.log("STREAM END: " + stream.atEnd());
console.log("FILE ENDS HERE");
//phantom.exit();
}
if (!stream.atEnd()) {
var line = stream.readLine();
console.log("LINE: " + line);
getRequest(line);
}
}
function getRequest(line2) {
//console.log(line);
var page = webPage.create();
page.settings.loadImages = false;
page.open(line2, function() {});
//console.log("page.open() " + line2);
//console.log("opened " + line2);
page.onResourceRequested = function(requestData, request) {
//console.log("BEFORE: " + requestData.url);
var match = requestData.url.match(/example.com\/ca/g)
//console.log("Match: " + match);
//console.log(request.url);
if (match != null) {
hasFound = true;
var targetString = decodeURI(JSON.stringify(requestData.url));
var klammerauf = targetString.indexOf("{");
var jsonobjekt = targetString.substr(klammerauf, (targetString.indexOf("}") - klammerauf) + 1);
targetJSON = (decodeURIComponent(jsonobjekt));
var t = JSON.parse(targetJSON);
console.log(i + " " + t['groups'] + " " + t['campID']);
i++;
//console.log(targetJSON);
request.abort;
}
};
page.onLoadFinished = function(status) {
if (!hasFound) {
console.log(i + " :NOT FOUND: " + line2);
i++;
}
//request.abort();
page.close();
nextPage();
}
}
nextPage();