伙计们,这是我另一个问题的后续内容,我创建了一个Nodejs Scraper,它似乎不想浏览所有页面,它停留在第一个页面上。我的源代码如下
const rp = require('request-promise');
const request = require('request');
const otcsv = require('objects-to-csv');
const cheerio = require('cheerio');
//URL To scrape
const baseURL = 'xxx';
const searchURL = 'xxxx';
//scrape info
const getCompanies = async () => {
// Pagination test
for (let index = 1; index <= 20; index = index + 1) {
const html = await rp.get(baseURL + searchURL + index);
const $ = await cheerio.load(html);
console.log("Loading Pages....");
console.log("At page number " + index);
// end pagination test
//const htmls = await rp(baseURL + searchURL);
const businessMap = cheerio('a.business-name', html).map(async (i, e) => {
const link = baseURL + e.attribs.href;
const innerHtml = await rp(link);
const emailAddress = cheerio('a.email-business', innerHtml).prop('href');
const name = e.children[0].data || cheerio('h1', innerHtml).text();
const phone = cheerio('p.phone', innerHtml).text();
return {
// link,
name,
emailAddress: emailAddress ? emailAddress.replace('mailto:', '') : '',
phone,
}
}).get();
return Promise.all(businessMap);
}
};
console.log("Finished Scraping.... Now Saving!")
//save to CSV
getCompanies()
.then(result => {
const transformed = new otcsv(result);
return transformed.toDisk('./output.csv');
})
.then(() => console.log('Scrape Complete :D '));
如您所见,我已经尝试了几种不同的方法来实现这一目标,因此我们将不胜感激任何帮助。