我正在编写一个 Web 抓取工具，它会根据如下所示的列表发出多个请求：
1. Category1
1a. categoryItem1
1b. categoryItem2
2. Category2
2a. categoryItem1
2b. categoryItem2
2c. categoryItem3
3. Category3
3a. categoryItem1
Category 和 categoryItem 都是链接。一次只能展开一个 Category。Category 和 categoryItem 的数量可能会发生变化，所以我事先不知道具体数量。
我正在收集每个 categoryItem 页面上的数据，以便保存到一个 JSON 中，其结构如下所示：
{
    "Category1": [
        { "categoryItem1": {
            // Details saved here
        } },
        { "categoryItem2": {
            // Details saved here
        } }
    ],
    "Category2": [
        { "categoryItem1": {
            // Details saved here
        } },
        { "categoryItem2": {
            // Details saved here
        } },
        { "categoryItem3": {
            // Details saved here
        } }
    ],
    "Category3": [
        { "categoryItem1": {
            // Details saved here
        } }
    ]
}
唯一剩下的问题是弄清楚如何让下面这个流程按顺序（同步地）执行：
Category 列表 → categoryItem → 详细信息页面
THIS 是我参考的网页抓取教程。由于请求是异步调用，我无法知道最后一个页面是在什么时候解析完成的，下面是脚本的结构：
server.js
// Scraper server from the question. GET /scrape issues three levels of nested
// request.get calls (list page -> expanded category -> categoryItem page) and
// accumulates the results in globalJSON.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
app.get('/scrape', function (req, res) {
// NOTE(review): globalJSON and baseUrl are assigned without var/let/const, so
// they become implicit globals rather than locals of this handler.
globalJSON = {};
baseUrl = 'http://...';
// 1.) open page with list
request.get(baseUrl, function (error, response, html) {
// NOTE(review): there is no else branch; when error is set, nothing in this
// handler sends a response.
if (!error) {
var $ = cheerio.load(html);
// select the list
// The callback returns nothing; .filter is used here only to run the callback
// for the matched elements.
$('#categoryListSelector').filter(function () {
var data = $(this);
var listItem = data.find('#listItemSelector');
var expansionLink = listItem.find('a').attr('href'); // <a href=""></a>
var category = listItem.find('font').text();
// Save category to global json
globalJSON[category] = [];
// 2.) Expand the list by opening expansionLink
request.get(baseUrl + expansionLink, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);
// Select the sub items of each list item
$('#subItem selector').filter(function () {
var data = $(this);
var categoryItemPageLinkElement = data.find('a');
var categoryItemName = categoryItemPageLinkElement.text();
var categoryItemLink = $(categoryItemPageLinkElement).attr('href');
// NOTE(review): "undefinded" is misspelled, so the typeof comparison is always
// true; the loose `!= null` check is what actually excludes undefined.
if (typeof categoryItemLink != "undefinded" && categoryItemLink != null && categoryItemLink != "") {
// NOTE(review): both names below are assigned without a declaration, so every
// iteration shares the same two bindings. The request callback further down
// reads them when it runs, which may be after a later iteration has already
// reassigned them.
categoryItemObject = {}; // { categoryItemName: categoryItemDetails }
categoryItemDetails = {};
// 3.) Open the categoryItem page to start gathering data
request.get(baseUrl + categoryItemLink, function (error, response, html) {
if (!error) {
var $ = cheerio.load(html);
// GATHER and save data here
// Done gathering data save to global json
categoryItemObject[categoryItemName] = categoryItemDetails;
globalJSON[category].push(categoryItemObject);
}
});
}
});
}
});
});
// NOTE(review): the two statements below execute right after the .filter call
// above returns, inside the callback of the FIRST request. Nothing here waits
// for the nested request.get callbacks (presumably invoked asynchronously), so
// globalJSON is serialized and sent before they have pushed any item into it.
// This is the ordering problem the question is about.
// The writeFile callback ignores `err` and logs success unconditionally.
fs.writeFile('output.json', JSON.stringify(globalJSON, null, 4), function (err) {
console.log('File successfully written!');
});
res.send(globalJSON);
}//END if(!error)
});
})//END app.get()
// The port is passed as a string.
app.listen('8081')
console.log('Magic happens on port 8081');
// Export the Express app object as the module's value.
exports = module.exports = app;
在下面一位回答者的帮助下，我确实解决了我的问题，这就是我得出的方案。当然，可能还有更好的方式，欢迎随时告诉我。
基本布局
// Basic layout: two fan-out stages chained together so that the finishing step
// runs only after every request of both stages has completed.
// Relies on categoriesArr, allCategoryItems, baseUrl and request being defined
// by the surrounding script.
Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject) => {
    request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html) => {
        if (error) {
            return reject(error);
        }
        // build an array of ALL the categoryItemLinks
        // resolve() forwards only its first argument, so bundle both values
        // into a single object instead of passing them separately.
        return resolve({ response, html });
    });
}))).then(function (categoryPages) {
    // Return the second stage so the outer chain waits for it and so that its
    // rejections propagate to the single catch at the end.
    return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject) => {
        request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html) => {
            if (error) {
                return reject(error);
            }
            // Gather Data and put into dataJson
            return resolve({ response, html });
        });
    })));
}).then(function (data) {
    // Do finishing stuff
}).catch(function (error) {
    // One handler covers a failure in either stage; a catch() without a
    // callback would leave the rejection unhandled.
    console.error(error);
});
server.js
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
/**
 * Fetch a page and resolve with its HTML body.
 *
 * Adapts the callback-style request.get to a Promise so that several fetches
 * can be combined with Promise.all.
 *
 * @param {string} url - URL to request.
 * @returns {Promise<string>} Resolves with the body handed to the request.get
 *     callback; rejects with the error reported by request.get.
 */
function fetchHtml(url) {
    return new Promise((resolve, reject) => {
        request.get(url, (error, response, html) => {
            if (error) {
                return reject(error);
            }
            // resolve() forwards only its first argument, so pass the body alone.
            return resolve(html);
        });
    });
}

/**
 * GET /scrape
 *
 * Stage 0: load the base page and collect the category links.
 * Stage 1: load every category page and collect the categoryItem page links.
 * Stage 2: load every categoryItem page and gather its data into dataJson.
 * The response is sent only after all stage-2 requests have completed; any
 * failure along the way is answered with HTTP 500.
 */
app.get('/scrape', function (req, res) {
    // Declared locally so that concurrent /scrape requests do not share state
    // through implicit globals.
    const dataJson = {}; // Holds all the scraped data for this request
    const baseUrl = 'http://www.blahblah.org';

    fetchHtml(baseUrl).then(function (html) {
        const $ = cheerio.load(html);
        const categoriesArr = [];
        // .each instead of .filter: the callback is run for its side effect only.
        $('#categorySelector').each(function () {
            const categoryItemLink = $(this).find('a').attr('href');
            // Skip entries without an href; otherwise baseUrl + undefined
            // would be requested.
            if (categoryItemLink != null && categoryItemLink !== '') {
                categoriesArr.push({
                    "categoryItemLink": categoryItemLink
                });
            }
        });

        // Stage 1: each promise resolves with the item links found on its own
        // category page, so no shared array is mutated from concurrent callbacks.
        return Promise.all(categoriesArr.map(categoryObj =>
            fetchHtml(baseUrl + categoryObj.categoryItemLink).then(function (categoryHtml) {
                const $ = cheerio.load(categoryHtml);
                const categoryItems = [];
                $('#categoryItemSelector').each(function () {
                    const categoryItemPageLink = $(this).find('a').attr('href');
                    if (categoryItemPageLink != null && categoryItemPageLink !== '') {
                        categoryItems.push({
                            "categoryItemPageLink": categoryItemPageLink
                        });
                    }
                });
                return categoryItems;
            })
        ));
    }).then(function (itemsPerCategory) {
        // Flatten the per-category lists; the order follows categoriesArr.
        const allCategoryItems = [].concat(...itemsPerCategory);

        // Stage 2: returned so the chain waits for every categoryItem page.
        return Promise.all(allCategoryItems.map(categoryItemObject =>
            fetchHtml(baseUrl + categoryItemObject.categoryItemPageLink).then(function (itemHtml) {
                const $ = cheerio.load(itemHtml);
                // Gather Data and put into dataJson
            })
        ));
    }).then(function () {
        // Do finishing stuff
        // All pages have been processed at this point; answer the client.
        res.send(dataJson);
    }).catch(function (error) {
        // Reached when any request of any stage fails. Always answer so the
        // client is not left waiting.
        console.error(error);
        res.status(500).send('Scrape failed');
    });
}); // END app.get()

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
答案 0 :(得分:0)
您可以使用 Promise.all，例如：
Promise.all(urls.map(url => new Promise((resolve, reject)=>{
request.get(url, (err, res, html)=>{
if(err){
return reject(err);
}
return resolve(res, html);
});
}))).then(/*success*/).catch(/*error*/);
在该代码中，.then() 会在所有请求都返回响应后执行。