使用 Cheerio 并行抓取

时间:2020-04-11 09:46:49

标签: promise request cheerio request-promise

我正在尝试并行抓取一个包含多个分类(部分)并且带有分页的网站。思路是遍历每个分类的每一页。

例如,如果有 6 个分类,每个分类每页有 6 个条目(总页数最多 10 页),那么我希望代码至少并行运行 6 个作业。

下面是我目前的代码:

const cheerio = require('cheerio');
const request = require('request-promise');
// Ends with '/', and the paths appended to it below start with '/', so the built URLs contain '//' after the host.
const baseUrl = 'https://www.bankmega.com/';
// Module-level state: both variables are assigned inside scrapePerCategory on every loop iteration.
let category = 0;
let page = 0;

// Entry point: fetch the promo landing page and start one scrapePerCategory job per category image.
(async function () {
    try {
        const homePage = baseUrl + '/promolainnya.php';
        const html = await request(homePage);
        const $ = cheerio.load(html);
        // Every <img> below the div with id "subcatpromo" becomes one job; its 1-based position is used as the category number.
        // NOTE(review): `jobs` is whatever cheerio's `.map()` returns - no `.get()` is applied before it is handed to Promise.all.
        const jobs = $('div[id="subcatpromo"]').find('img').map((i, img) => scrapePerCategory({title: $(img).attr('title'), category: i + 1}));
        await Promise.all(jobs); // error  TypeError: undefined is not a function
    } catch (e) {
        // Failures are only logged here; nothing is rethrown.
        console.log('error in main ', e);
    }
})();

/**
 * Scrapes every listing page of one category and the detail page of each promo linked from it.
 *
 * @param {{title: *, category: *}} job - `title` is only used in log output; `category` is sent
 *     as the `subcat` query parameter.
 * @returns {Promise<Array|undefined>} `result`, which receives one array of `{title, imageurl}`
 *     objects per page; resolves to `undefined` when an error was caught, because the catch
 *     block only logs and does not rethrow.
 */
const scrapePerCategory = async (job) => {
    try {
        // Page count; filled in from the first response that yields a non-zero count.
        let pageNumber;
        let i = 1;
        let result = [];
        console.log('start scraping for category ' + job.title);
        do {
            // NOTE: `page` and `category` are the module-level variables, not locals, so every
            // job writes to the same two variables. They are read back on the next statement,
            // before any `await`.
            page = i;
            category = job.category;

            const url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${category}&page=${page}`;
            const html = await request(url);
            const $ = cheerio.load(html);
            if (!pageNumber) {
                // Number of `a.page_promo_lain` elements carrying an id attribute
                // (presumably one pagination link per page - TODO confirm against the markup).
                pageNumber = $('a.page_promo_lain[id]').length;
            }
            // The callback is async, so each call yields a promise; `.get()` turns the cheerio
            // result into a plain array of those promises. The callback's `i` and `$` shadow
            // the outer loop counter and the listing page's `$`.
            const temp = $('#promolain').find('a').map(async (i, promoElem) => {
                const title = cheerio(promoElem).find('img').attr('title');
                const detailLink = cheerio(promoElem).attr('href');
                const detailHTML = await request(baseUrl + detailLink);
                const $ = cheerio.load(detailHTML);
                const imageurl = baseUrl + $('.keteranganinside').find('img').attr('src');
                console.log('category : ' + job.category + ' with item  => ' + JSON.stringify({title: title, imageurl: imageurl}));
                return {title: title, imageurl: imageurl};
            }).get();
            // The resolved array for this page is pushed as a single entry, so `result` ends up
            // as an array of per-page arrays.
            await Promise.all(temp).then(r => result.push(r));
            i++;
        } while (i <= pageNumber) ;
        // The string produced in `.then()` is discarded: it is neither logged nor returned.
        await Promise.all(result).then((r) => "done scraping for category " + job.title);
        return result;
    } catch (e) {
        // Logged only; the error is not propagated to the caller.
        console.log('error in category', e);
    }
};

运行时它会按预期打印

start scraping for category Travel
start scraping for category Lifestyle
start scraping for category Food & Beverages
start scraping for category Gadget & Entertainment
start scraping for category Daily Needs
start scraping for category Others
category : 6 with item  => {"title":"Perubahan Minimum Payment","imageurl":"https://www.bankmega.com//files/images/minimum payment-lp- rev.jpg"}
category : 1 with item  => {"title":"Visa Bluebird Diskon hingga 25ribu","imageurl":"https://www.bankmega.com//files/images/0-landing-page-BLUE-BIRD.jpg"}
category : 6 with item  => {"title":"Aktivasi Kartu Kredit dan PIN","imageurl":"https://www.bankmega.com//files/images/AKTIVASI-CC-lp-CS5-revrainy.jpg"}

但是,调用方(main 方法)会报错,如下所示:

error in main  TypeError: undefined is not a function
    at Function.all (<anonymous>)

这让我想知道代码是否确实按预期运行。

1 个答案:

答案 0 :(得分:1)

尽管要到第一次迭代时才能得知 pageNumber,这会让代码显得有些凌乱,但迭代方法应该是可行的。改用递归应该会更整洁。

我已经在这上面花了不少时间,而且我得出门去参加法律练习了,因此这里先给出一个应该可以工作的迭代版本。你可能需要在个别地方对它进行修正。

const cheerio = require('cheerio');
const request = require('request-promise');
// Ends with '/', while the request paths appended to it below start with '/'.
const baseUrl = 'https://www.bankmega.com/';

// Entry point: load the promo landing page, launch one scrape job per category image and
// print the collected results once every job has finished.
(async () => {
    try {
        const landingHtml = await request(baseUrl + '/promolainnya.php');
        const $ = cheerio.load(landingHtml);
        // `.get()` yields a plain array of <img> elements, so Array.prototype.map (element, index) applies.
        const categoryImages = $('div[id="subcatpromo"]').find('img').get();
        const pending = categoryImages.map((img, index) => scrapePerCategory({
            'title': $(img).attr('title'),
            'category': index + 1
        }));
        // All category jobs are already running at this point; wait for every one of them.
        const jobs = await Promise.all(pending);
        console.log(jobs);
    } catch (e) {
        console.log('error in main ', e);
    }
})();

/**
 * Scrapes every listing page of one promo category and collects its items.
 *
 * Listing pages are fetched one after another, because the page count is only known once the
 * first page has been loaded. The detail pages of the promos on a single listing page are
 * independent of each other and are fetched concurrently.
 *
 * @param {{title: *, category: *}} job - `title` is only used in log output; `category` is sent
 *     as the `subcat` query parameter.
 * @returns {Promise<Array<Array<{title: *, imageurl: (string|null)}>>>} One inner array per
 *     listing page, in page order; items keep the order of the anchors on the page.
 *     `imageurl` is `null` when the detail page has no matching image `src`.
 * @throws Rethrows whatever error occurred, after logging it.
 */
const scrapePerCategory = async (job) => {
    try {
        let pageNumber;
        let page = 1;
        const results = [];
        do {
            const url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${job.category}&page=${page}`;
            const $ = cheerio.load(await request(url));
            if (pageNumber === undefined) {
                // Discovered once, from the first listing page.
                pageNumber = $('a.page_promo_lain[id]').length;
            }
            // `.get()` converts the cheerio selection into a plain array of elements.
            const anchors = $('#promolain').find('a').get();
            // Promise.all preserves input order, so the items stay in page order even though
            // the detail requests run concurrently.
            const innerResults = await Promise.all(anchors.map(async (anchor) => {
                // Wrap the element with the `$` of the page it came from.
                const promoElem = $(anchor);
                // Named `$detail` so it does not shadow the listing page's `$`.
                const $detail = cheerio.load(await request(baseUrl + promoElem.attr('href')));
                const src = $detail('.keteranganinside').find('img').attr('src');
                return {
                    'title': promoElem.find('img').attr('title'),
                    // Avoid producing a bogus "<baseUrl>undefined" URL when no image is present.
                    'imageurl': src === undefined ? null : baseUrl + src
                };
            }));
            // One entry per listing page.
            results.push(innerResults);
            page++;
        } while (page <= pageNumber);
        console.log("done scraping for category " + job.title);
        return results;
    } catch (e) {
        console.log('error in category', e);
        throw e;
    }
};