我正在尝试同时刮除具有多个部分和分页的网站。想法是浏览每页的每个部分。
例如,如果有6个部分,每个部分每页有6个项目(总页面最多10个),那么我希望代码在并行运行至少6个作业。
下面是我所拥有的
const cheerio = require('cheerio');
const request = require('request-promise');
const baseUrl = 'https://www.bankmega.com/';
// NOTE(review): these module-level counters are written by every concurrent
// scrape job in scrapePerCategory — a latent race; kept for compatibility.
let category = 0;
let page = 0;
// Entry point: load the promo landing page, then scrape every category in parallel.
(async function () {
try {
const homePage = baseUrl + '/promolainnya.php';
const html = await request(homePage);
const $ = cheerio.load(html);
// Cheerio's .map() returns a cheerio wrapper, not a real Array, so passing it
// straight to Promise.all raised "TypeError: undefined is not a function".
// .get() unwraps the wrapper into a plain Array of promises.
const jobs = $('div[id="subcatpromo"]').find('img').map((i, img) => scrapePerCategory({title: $(img).attr('title'), category: i + 1})).get();
await Promise.all(jobs);
} catch (e) {
console.log('error in main ', e);
}
})();
/**
 * Scrape every page of one promo category, following each promo's detail
 * link to resolve its image URL.
 * @param {{title: string, category: number}} job - category title and 1-based index.
 * @returns {Promise<Array<Array<{title: string, imageurl: string}>>|undefined>}
 *          one inner array per page; undefined on failure (errors are logged, not rethrown).
 */
const scrapePerCategory = async (job) => {
try {
let pageNumber;
// Local page counter: the original assigned the module-level `page` and
// `category`, which races when several categories scrape concurrently.
let currentPage = 1;
const result = [];
console.log('start scraping for category ' + job.title);
do {
const url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${job.category}&page=${currentPage}`;
const html = await request(url);
const $ = cheerio.load(html);
// The pager links appear on every page; count them once, on the first page.
if (!pageNumber) {
pageNumber = $('a.page_promo_lain[id]').length;
}
// Each anchor needs a detail-page fetch; fire them in parallel and
// unwrap the cheerio map result into a plain Array of promises.
const temp = $('#promolain').find('a').map(async (i, promoElem) => {
const title = cheerio(promoElem).find('img').attr('title');
const detailLink = cheerio(promoElem).attr('href');
const detailHTML = await request(baseUrl + detailLink);
const $detail = cheerio.load(detailHTML);
const imageurl = baseUrl + $detail('.keteranganinside').find('img').attr('src');
console.log('category : ' + job.category + ' with item => ' + JSON.stringify({title: title, imageurl: imageurl}));
return {title: title, imageurl: imageurl};
}).get();
// One array of items per page. (The original's trailing
// `await Promise.all(result)` on already-resolved arrays was dead code.)
result.push(await Promise.all(temp));
currentPage++;
} while (currentPage <= pageNumber);
return result;
} catch (e) {
console.log('error in category', e);
}
};
运行时它会按预期打印
start scraping for category Travel
start scraping for category Lifestyle
start scraping for category Food & Beverages
start scraping for category Gadget & Entertainment
start scraping for category Daily Needs
start scraping for category Others
category : 6 with item => {"title":"Perubahan Minimum Payment","imageurl":"https://www.bankmega.com//files/images/minimum payment-lp- rev.jpg"}
category : 1 with item => {"title":"Visa Bluebird Diskon hingga 25ribu","imageurl":"https://www.bankmega.com//files/images/0-landing-page-BLUE-BIRD.jpg"}
category : 6 with item => {"title":"Aktivasi Kartu Kredit dan PIN","imageurl":"https://www.bankmega.com//files/images/AKTIVASI-CC-lp-CS5-revrainy.jpg"}
但是,当调用方(主方法给出错误)时,如下所示
error in main TypeError: undefined is not a function
at Function.all (<anonymous>)
这让我想知道代码是否确实按预期运行。
答案 0（得分：1）：
尽管要在第一次迭代中才获取 pageNumber 会让代码有些混乱，但迭代方法应该可以工作。改用递归会更整洁。
我现在时间有限（需要外出参加法律实务），因此这里先给出可以工作的迭代版本。您可能需要在这里和那里稍作修正。
const cheerio = require('cheerio');
const request = require('request-promise');
const baseUrl = 'https://www.bankmega.com/';
// Entry point: fetch the promo landing page, spawn one scrape job per
// category image, and wait for all of them to finish.
(async function () {
try {
const landingHtml = await request(baseUrl + '/promolainnya.php');
const $ = cheerio.load(landingHtml);
// Convert the matched <img> set to a plain array before mapping, so the
// result is a real Array of promises that Promise.all can consume.
const categoryImages = $('div[id="subcatpromo"]').find('img').get();
const promises = categoryImages.map((img, i) =>
scrapePerCategory({'title': $(img).attr('title'), 'category': i + 1}));
const jobs = await Promise.all(promises);
console.log(jobs);
} catch (e) {
console.log('error in main ', e);
}
})();
/**
 * Scrape every page of one promo category, following each promo's detail
 * link to resolve its image URL.
 * @param {{title: string, category: number}} job - category title and 1-based index.
 * @returns {Promise<Array<Array<{title: string, imageurl: string}>>>}
 *          one inner array of items per page.
 * @throws rethrows any request/parse failure after logging it, so the
 *         caller's Promise.all rejects instead of silently getting undefined.
 */
const scrapePerCategory = async (job) => {
try {
let pageNumber;
let page = 1;
const results = [];
do {
const url = baseUrl + `/ajax.promolainnya.php?product=1&subcat=${job.category}&page=${page}`;
const $ = cheerio.load(await request(url));
// Pager links appear on every page; count them once, on the first page,
// to learn how many pages to walk.
if (!pageNumber) {
pageNumber = $('a.page_promo_lain[id]').length;
}
// Plain for-loop (not .map with an async callback) so each `await`
// genuinely pauses the iteration. `let i` keeps the index block-scoped
// (the original `var` was function-scoped).
const innerResults = [];
const anchors = $('#promolain').find('a');
for (let i = 0; i < anchors.length; i++) {
const promoElem = cheerio(anchors[i]);
// `$detail` avoids shadowing the outer `$` bound to the listing page.
const $detail = cheerio.load(await request(baseUrl + promoElem.attr('href')));
innerResults.push({
'title': promoElem.find('img').attr('title'),
'imageurl': baseUrl + $detail('.keteranganinside').find('img').attr('src')
});
}
results.push(innerResults); // one array per page
page++;
} while (page <= pageNumber);
console.log("done scraping for category " + job.title);
return results;
} catch (e) {
console.log('error in category', e);
throw e;
}
};