I have a very simple scraper. It goes through about 250 pages, allocates roughly 400 MB of memory, and never releases it. I don't know how to fix this; maybe someone will spot something and let me know.
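One way to watch the growth is to sample process.memoryUsage() while the scrape runs; a minimal sketch (the 5-second interval and the optional --expose-gc forced collection are debugging assumptions, not part of the scraper):

// memory sampler for debugging; start node with --expose-gc if you want to
// force a collection before each sample
setInterval(function () {
  if (global.gc) global.gc() // only defined when started with --expose-gc
  var m = process.memoryUsage()
  console.log('rss ' + (m.rss / 1048576).toFixed(1) + ' MB, heapUsed ' +
    (m.heapUsed / 1048576).toFixed(1) + ' MB')
}, 5000)

The scraper itself: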
// dependencies: the async library for the queue; req is assumed to be the request module
var async = require('async')
var req = require('request')

function scrape(shop, o, cb, step) {
  var itemz = []

  // request queue with concurrency o.threads: each task fetches a URL and
  // passes the response body to the task's callback
  var q = async.queue(function (task, next) {
    req({
      url: task.url
    }, function (e, r) {
      if (e) throw e
      next()
      task.cb(r.body)
    })
  }, o.threads)

  var get = function (url, cb) {
    q.push({
      url: url,
      cb: cb
    })
  }

  var url = 'https://www.host.com'
  var total
  var done = 0
  var itemsPerPage = 24

  // fetch the first page to find out how many result pages there are,
  // then queue every page
  get(url, function (r) {
    var pages = r.match(/data-page="(\d+)"/g)
    pages = pages[pages.length - 2].split('data-page="')[1].split('"')[0] || 1
    pages = Math.min(pages, 10) // limit to 10 pages max (240 items)
    for (var i = 1; i <= pages; i++) {
      get(url + '&page=' + i, scrapeList)
    }
    total = pages + pages * itemsPerPage
  })

  // - extract the transaction links from a listing page and add each item
  //   page to the queue
  function scrapeList(r) {
    var itemsFound = 0
    r.replace(/href="(https:\/\/www.host.com\/listing\/(\d+).*)"/g, function (s, itemUrl, dateSold) {
      itemsFound++
      get(itemUrl, function (r) {
        scrapeItem(r, itemUrl, dateSold)
        step(++done, total)
        if (done == total) onend()
      })
    })
    // decrease the expected total if the page held fewer items than expected
    total -= itemsPerPage - itemsFound
    step(++done, total)
  }

  // - from the item page extract the details and add them to the items array
  function scrapeItem(r, itemUrl, dateSold) {
    var d = {}
    d.url = itemUrl
    d.date = new Date(Date.now())
    d.quantity = 1
    itemz.push(d)
  }

  // - once all expected requests have finished, hand the collected items back
  //   to the caller (the intended grouping by title is not implemented yet)
  function onend() {
    cb(null, itemz)
  }
}
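For reference, this is roughly how scrape() is driven; the shop name, thread count, and the two callbacks below are placeholders, only the signature matches the code above:

// hypothetical caller: o.threads sets the queue concurrency,
// step(done, total) reports progress, cb(err, items) receives the result
scrape('someShop', { threads: 4 }, function (err, items) {
  if (err) return console.error(err)
  console.log('collected ' + items.length + ' items')
}, function (done, total) {
  console.log('progress ' + done + '/' + total)
})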