我有一个api to crawler,我尝试使用global()。gc来减少每个循环后的堆内存,但它不起作用。请指出我做错了什么? 它还会导致堆内存不足问题
app.get('/test2', (req, res) => {
res.json({mes:'正在获取数据'});
array = [...]; //数组包含大约1000个元素作为链接
function something(){
let d = q.defer();
let urls = [];
array.forEach(function (mang, index) {
let tagArray = [];
tagArray = null;
tagArray = [];
//use this function to reduce the memory heap after looping each element of array
global.gc();
for (let i = 1; i <= 4000; i++) {
urls.push(function (callback) {
setTimeout(function () {
let link = 'http://something' + mang.link + '/tag-' + i;
//we will have about 4000 links due to i
let x = link;
let options = {
url: link,
headers: {
'User-Agent': 'MY IPHONE 7s'
}
};
function callback1(error, response, html) {
if (!error) {
let $ = whacko.load(html);
let tag_name = $('h1').text();
tag_name = tag_name.trim();
console.log(tag_name);
let tag_content = $('#content').find('div').contents();
tag_content = tag_content.toString();
if (tag_name !== "" && tag_content !== "") {
let tagObject = new Object();
tagObject.tag_name = tag_name;
tagObject.tag_content = tag_content;
tagObject.tag_number = i;
tagArray.push(tagObject);
if (tagArray.length == 4000) {
tagArray.sort(function (a, b) {
return parseInt(a.tag_number) - parseInt(b.tag_number);
});
for (let v = 0; v < tagArray.length; v++) {
db.query("INSERT INTO `tags` (tag_name, content, tag_number) " +
"SELECT * FROM (SELECT " + "'" + tagArray[v].tag_name + "'" + "," + "'" + tagArray[v].tag_content + "','" + tagArray[v].tag_number + "' as ChapName) AS tmp " +
"WHERE NOT EXISTS (SELECT `tag_name` FROM `tags` WHERE `tag_name`=" + "'" + tagArray[v].tag_name + "'" + ") " +
"LIMIT 1", (err) => {
if (err) {
console.log(err);
}
});
}
urls = null;
}
}
}
}
request(options, callback1);
callback(null, x);
}, 12000);
});
}
});
d.resolve(urls);
return d.promise;
}
something()
.then(function (data) {
let tasks = data;
console.log("start data");
async.parallelLimit(tasks, 40, () => {
console.log("DONE ");
});
})