I have a very simple scraper. It goes through about 250 pages, allocates roughly 400 MB of memory, and never releases it. I don't know how to fix this; maybe someone will spot something and let me know.
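One way to watch the growth is to sample process.memoryUsage() while the scrape runs; a minimal sketch (the 5-second interval and the optional --expose-gc forced collection are debugging assumptions, not part of the scraper):

// memory sampler for debugging; start node with --expose-gc if you want to
// force a collection before each sample
setInterval(function () {
  if (global.gc) global.gc() // only defined when started with --expose-gc
  var m = process.memoryUsage()
  console.log('rss ' + (m.rss / 1048576).toFixed(1) + ' MB, heapUsed ' +
    (m.heapUsed / 1048576).toFixed(1) + ' MB')
}, 5000)

The scraper itself: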
// dependencies: the async library for the queue; req is assumed to be the request module
var async = require('async')
var req = require('request')

function scrape(shop, o, cb, step) {
  var itemz = []

  // request queue with concurrency o.threads: each task fetches a URL and
  // passes the response body to the task's callback
  var q = async.queue(function (task, next) {
    req({
      url: task.url
    }, function (e, r) {
      if (e) throw e
      next()
      task.cb(r.body)
    })
  }, o.threads)

  var get = function (url, cb) {
    q.push({
      url: url,
      cb: cb
    })
  }

  var url = 'https://www.host.com'
  var total
  var done = 0
  var itemsPerPage = 24

  // fetch the first page to find out how many result pages there are,
  // then queue every page
  get(url, function (r) {
    var pages = r.match(/data-page="(\d+)"/g)
    pages = pages[pages.length - 2].split('data-page="')[1].split('"')[0] || 1
    pages = Math.min(pages, 10) // limit to 10 pages max (240 items)
    for (var i = 1; i <= pages; i++) {
      get(url + '&page=' + i, scrapeList)
    }
    total = pages + pages * itemsPerPage
  })

  // - extract the transaction links from a listing page and add each item
  //   page to the queue
  function scrapeList(r) {
    var itemsFound = 0
    r.replace(/href="(https:\/\/www.host.com\/listing\/(\d+).*)"/g, function (s, itemUrl, dateSold) {
      itemsFound++
      get(itemUrl, function (r) {
        scrapeItem(r, itemUrl, dateSold)
        step(++done, total)
        if (done == total) onend()
      })
    })
    // decrease the expected total if the page held fewer items than expected
    total -= itemsPerPage - itemsFound
    step(++done, total)
  }

  // - from the item page extract the details and add them to the items array
  function scrapeItem(r, itemUrl, dateSold) {
    var d = {}
    d.url = itemUrl
    d.date = new Date(Date.now())
    d.quantity = 1
    itemz.push(d)
  }

  // - once all expected requests have finished, hand the collected items back
  //   to the caller (the intended grouping by title is not implemented yet)
  function onend() {
    cb(null, itemz)
  }
}
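For reference, this is roughly how scrape() is driven; the shop name, thread count, and the two callbacks below are placeholders, only the signature matches the code above:

// hypothetical caller: o.threads sets the queue concurrency,
// step(done, total) reports progress, cb(err, items) receives the result
scrape('someShop', { threads: 4 }, function (err, items) {
  if (err) return console.error(err)
  console.log('collected ' + items.length + ' items')
}, function (done, total) {
  console.log('progress ' + done + '/' + total)
})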