以下程序的目的是抓取CNN网站,并将其所有文本写入单个文件(使用了几个第三方库)
运行时我得到了以下错误:
RangeError: Maximum call stack size exceeded
如何解决此问题,或者该如何绕过它?有没有办法可以"释放"内存?该怎么做?
// Crawls a site with js-crawler, converts every fetched page to plain text
// and appends that text to a single output file.
//----------Configuration--------------
var startingUrl = "http://cnn.com"; //keep the http\https or www prefix
// Link depth handed to js-crawler; kept as a number rather than a string.
// NOTE(review): running with a depth this large is what was reported to end in
// "RangeError: Maximum call stack size exceeded" - consider a smaller value.
var crawlingDepth = 50;
var outputFileName = "cnn.txt";
//-------------------------------------
var Crawler = require("js-crawler");
var sanitizeHtml = require('sanitize-html');
var htmlToText = require('html-to-text');
var fs = require('fs');
var index = 0; // number of pages crawled so far

new Crawler().configure({depth: crawlingDepth})
    .crawl(startingUrl, function onSuccess(page) {
        // Strip the markup and keep only the readable text of the page.
        var text = htmlToText.fromString(page.body, {
            wordwrap: false,
            hideLinkHrefIfSameAsText: true,
            ignoreHref: true,
            ignoreImage: true
        });
        index++;
        console.log(index + " pages were crawled");
        fs.appendFile(outputFileName, text, function (err) {
            if (err) {
                // Report the failure and stop: the success message below must
                // not be printed when the write did not happen.
                console.log(err);
                return;
            }
            console.log('It\'s saved! in same location.');
        });
    });
答案 0 :(得分:0)
1)这是递归深度过大导致的问题。
2)需要通过以下方式避免它:
每个深度级别都通过循环遍历当前级别中的链接来处理(第一级只有一个起始链接);
使用"Crawler.prototype._getAllUrls"获取当前页面中的链接,如果这些链接尚未处理,则将其加入下一级并在循环中访问;
3)以下仅为概念性示例:
// Level-by-level crawl: instead of letting the crawler follow links itself,
// every page is fetched with depth 1 and the links found on it are queued for
// the next level. Each step is scheduled with setTimeout rather than called
// directly from the previous step's callback.
var Urls = [ ["http://cnn.com/"] ]; // URLs to crawl, one array per depth level; Urls[0][0] is the start page
var crawledUrls = {}; // URLs already crawled or queued, used as a set
var crawlingDepth = 3; // Number of levels to crawl
var depth = 0; // Current depth
var index = 0; // Current index within the current level
var Crawler = require("js-crawler");

/**
 * Crawls the single URL at Urls[depth][index], queues the not-yet-seen
 * same-site links found on it into Urls[depth + 1], then schedules the next
 * URL (or the next level) until crawlingDepth levels have been processed or
 * a level turns out to be empty.
 */
function crawling() {
    var currentUrl = Urls[depth][index];
    console.log(depth, index, currentUrl);
    // Prepare next level
    if (typeof Urls[depth + 1] === "undefined") Urls.push([]);
    var nextLevel = Urls[depth + 1];
    // Already crawled flag
    crawledUrls[currentUrl] = true;
    // depth 1 is used so that only this one page is fetched; the traversal of
    // further levels is driven by this function.
    var crawler = new Crawler().configure({depth: 1});
    crawler.crawl({
        url: currentUrl,
        success: function(page) {
            // Do something with the crawled page here
            // Collect the urls found on the crawled page. The method is called
            // on the instance so that `this` inside it is a real crawler.
            var urls = crawler._getAllUrls(page.url, page.body);
            for (var j = 0; j < urls.length; j++) {
                var candidate = urls[j];
                // Keep only same-site links that were not crawled or queued yet
                if (crawledUrls[candidate] !== true
                        && candidate.indexOf(Urls[0][0]) === 0) {
                    // Mark at queue time so a link that appears on several
                    // pages of the same level is queued only once.
                    crawledUrls[candidate] = true;
                    nextLevel.push(candidate);
                }
            }
        },
        failure: function(page) {
            // Do not fail silently; the crawl still continues via `finished`.
            console.log("Failed to crawl " + currentUrl);
        },
        finished: function(crawled) {
            index++;
            if (index >= Urls[depth].length) {
                // Current level exhausted: move on to the next one
                depth++;
                index = 0;
            }
            // Continue only while within the depth limit and while the level
            // actually has a URL to crawl (the next level may be empty).
            if (depth < crawlingDepth && index < Urls[depth].length) {
                setTimeout(crawling, 0);
            }
            // Otherwise: finished
        }
    });
}
crawling();