我正在尝试从页面中抓取产品网址,然后使用这些网址访问每个产品页面并抓取产品的信息。我正在尝试创建一个Promise链,但我是Promises的初学者。挂机试图在访问每个产品网址后将产品价格作为数组返回。我陷入了厄运的承诺链金字塔。如何运行某种循环来清理底部的这个承诺链?
const request = require('request');
const cheerio = require('cheerio');
const fs = require('fs');
const rp = require('request-promise');
// Thanks to chovy @ stack overflow
//https://stackoverflow.com/questions/21194934/node-how-to-create-a-directory-if-doesnt-exist
if (!fs.existsSync("./data")) {
fs.mkdirSync("./data"); // create the data folder it doesn't exist
}
let mikeshirturl = 'http://www.shirts4mike.com/shirts.php'; //shirt site entry point
let shirturlscrape = new Promise((resolve) => { // make a promise to find all of the shirturls
request(mikeshirturl, (error, response, html) => { // request the shirt site
if (!error) {
const $ = cheerio.load(html); // use cheerio to scrape the page
let shirtlinks = []; //make an array to hold the links to each shirt
$('.products').filter(function() {
let data = $(this);
data.find('li a').each(function(i) { // scrape the hrefs from the shirt links
shirtlinks.push($(this).attr('href'));
});
let shirturls = shirtlinks.map((i) => {
return mikeshirturl.slice(0,27) + i; // make a new array of the completed shirturls
}); // end shirtlink map
resolve(shirturls); // send the urls back to the promise
}); // end products filter
} // end error check
}); // end request url
}); // end the promise
let shirtPrices = []; // array to hold prices
let shirtTitles = []; // array to hold titles
let shirtImageUrls = []; // array to hold shirtimages
let dates = []; // array to hold dates
let shirtDataHeaders = { // the headers for the csv
title: "Title",
price: "Price",
imageUrl: "ImageUrl",
Url: "Url",
Time: "Time"
};
let shirtData = []; // the complete array of shirt data
let d = new Date(); // get the date
let month = d.getMonth() + 1; // set the month
let day = d.getDate(); // set the day
let year = d.getFullYear(); // set the year
let currentDate = "" + year + "-" + month + "-" + day; // concatenate date to correct order
let getShirtData = (html) => { // function to scrape shirt data from shirt urls
const $ = cheerio.load(html);
$('.shirt-details').filter(function() {
let data = $(this);
shirtPrices.push(data.find('.price').text()); //get shirt prices from price class element
let shirtTitle = String(data.children().first().clone().children().remove().end().text()); //get shirt titles from h1 element
shirtTitle = shirtTitle.replace(/,/g , " -"); // replace commas in titles with dashes to not confuse csv file
shirtTitles.push(shirtTitle); //push shirt titles to array
});
$('.shirt-picture').filter(function() {
let data = $(this);
shirtImageUrls.push('' + mikeshirturl.slice(0,27) + data.find('img').attr('src')); // push shirt image urls to array
});
dates.push(currentDate); // push the date of scrape to array
};
let commaseparate = (callback) => { // functino to put data in csv format
let keys = Object.keys(shirtDataHeaders);
let values = Object.values(shirtDataHeaders);
let result = values.join(",") + "\n";
// Add the rows
callback.forEach((obj)=>{
keys.forEach((k, ix)=>{
if (ix) result += ",";
result += obj[k];
});
result += "\n";
});
shirtData = result;
return shirtData;
}
shirturlscrape // really wild attempt to follow the promise chain down from scraping each shirt url.
.then((result) => {
rp(result[0])
.then(getShirtData)
.then(() => {
rp(result[1])
.then(getShirtData)
.then(() => {
rp(result[2])
.then(getShirtData)
.then(() => {
rp(result[3])
.then(getShirtData)
.then(() => {
rp(result[4])
.then(getShirtData)
.then(() => {
rp(result[5])
.then(getShirtData)
.then(() => {
rp(result[6])
.then(getShirtData)
.then(() => {
rp(result[7])
.then(getShirtData)
.then(() => {
for (let i=0; i<result.length; i++) { //push each shirt data object to the final shirtdata array
shirtData.push({title: shirtTitles[i], price: shirtPrices[i], imageUrl: shirtImageUrls[i], Url: result[i], Time: dates[i]})
}
commaseparate(shirtData); // turn the data into csv formatting
fs.writeFile("./data/" + currentDate + '.csv' , shirtData); // write the new csv file to the data folder
});
});
});
});
});
});
});
});
})
.catch(()=>{ //check if there is an error on the initial mikeshirt site http request
console.error(`There’s been a 404 error. Cannot connect to http://shirts4mike.com.`);
});