NodeJS多个请求

时间:2016-10-18 18:34:41

标签: javascript node.js httprequest

我正在编写一个Web抓取工具,根据看起来像这样的列表发出多个请求

 1. Category1
    1a. categoryItem1
    1b. categoryItem2
 2. Category2
    2a. categoryItem1
    2b. categoryItem2
    2c. categoryItem3
 3. Category3
    3a. categoryItem1

Category 和 categoryItem 都是链接。一次只能展开一个 Category。Category 和 categoryItem 的数量可能会发生变化,所以我不知道具体数量。

我正在收集每个categoryItem页面上的数据,以便保存在json中,看起来像这样

{
    "Category1": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        }
    ],
    "Category2": [
        {
            "categoryItem1": {
                // Details saved here
            }
        },
        {
            "categoryItem2": {
                // Details saved here
            }
        },
        {
            "categoryItem3": {
                // Details saved here
            }
        }
    ],
    "Category3": [
        {
            "categoryItem1": {
                // Details saved here
            }
        }
    ]
}

唯一剩下的就是弄清楚如何让这个行为同步

  1. 获取首页
  2. 打开每个Category列表
  3. 打开每个categoryItem详细信息页面
  4. 如果你想知道,

    THIS 是我遵循的网络爬虫教程。由于请求是异步的,我无法知道最后一个页面何时解析完成,所以下面是脚本的结构:

    server.js

    var express = require('express');
    var fs = require('fs');
    var request = require('request');
    var cheerio = require('cheerio');
    var app = express();
    
    /**
     * GET /scrape — crawl the category list and collect data for every categoryItem.
     *
     * Flow: (1) fetch baseUrl, (2) fetch each category's expansion link,
     * (3) fetch each categoryItem page, storing the results in globalJSON as
     * { category: [ { categoryItemName: categoryItemDetails }, ... ] }.
     *
     * NOTE(review): the nested request.get callbacks run asynchronously, but
     * fs.writeFile and res.send near the end of this handler are called as soon
     * as the first page has been processed. At that moment globalJSON holds only
     * the category keys with empty arrays; nothing here waits for steps 2 and 3.
     */
    app.get('/scrape', function (req, res) {
    
        // NOTE(review): assigned without var/let/const, so (in non-strict code)
        // globalJSON and baseUrl become implicit globals shared by every request.
        globalJSON = {};
    
        baseUrl = 'http://...';
    
        // 1.) Open the page that contains the category list
        request.get(baseUrl, function (error, response, html) {
            // On error nothing is done at all: no log, and no response is sent.
            if (!error) {
    
                var $ = cheerio.load(html);
    
                // Select the list. filter() is used only to run the callback once per
                // matched element; its return value is ignored.
                $('#categoryListSelector').filter(function () {
                    var data = $(this);
    
                    var listItem = data.find('#listItemSelector');
    
                    var expansionLink = listItem.find('a').attr('href'); // href of the category's <a> element
                    var category = listItem.find('font').text();
    
                    // Register the category in the global json (runs synchronously)
                    globalJSON[category] = [];
    
                    // 2.) Expand the list by opening expansionLink (asynchronous)
                    request.get(baseUrl + expansionLink, function (error, response, html) {
                        if (!error) {
                            var $ = cheerio.load(html);
    
                            // Select the sub items of each list item
                            $('#subItem selector').filter(function () {
                                var data = $(this);
    
                                var categoryItemPageLinkElement = data.find('a');
    
                                var categoryItemName = categoryItemPageLinkElement.text();
                                var categoryItemLink = $(categoryItemPageLinkElement).attr('href');
    
                                // NOTE(review): "undefinded" is a typo for "undefined". typeof never
                                // yields that string, so the first comparison is always true and the
                                // null / empty-string checks do the actual filtering.
                                if (typeof categoryItemLink != "undefinded" && categoryItemLink != null && categoryItemLink != "") {
    
                                    // NOTE(review): no var here either. Both names are implicit globals
                                    // that are reassigned for every sub item, so the asynchronous
                                    // callback below may see the objects of a later iteration.
                                    categoryItemObject = {}; // { categoryItemName: categoryItemDetails }
                                    categoryItemDetails = {};
    
                                    // 3.) Open the categoryItem page to start gathering data (asynchronous)
                                    request.get(baseUrl + categoryItemLink, function (error, response, html) {
                                        if (!error) {
                                            var $ = cheerio.load(html);
    
                                            // GATHER and save data here
    
                                            // Done gathering data; save to the global json
                                            categoryItemObject[categoryItemName] = categoryItemDetails;
                                            globalJSON[category].push(categoryItemObject);
    
                                        }
                                    });
                                }
                            });
                        }
                    });
                });
    
                // Reached right after the filter() call above returns, i.e. before any of
                // the step 2 / step 3 callbacks have had a chance to run.
                // The err argument is ignored: success is logged even if the write failed.
                fs.writeFile('output.json', JSON.stringify(globalJSON, null, 4), function (err) {
                    console.log('File successfully written!');
                });
                res.send(globalJSON);
    
            }//END if(!error)
        });
    
    })//END app.get()
    
    // Start the HTTP server and announce the port it listens on.
    app.listen('8081');
    console.log('Magic happens on port 8081');

    // Expose the app through both CommonJS export aliases.
    module.exports = app;
    exports = module.exports;
    

    更新

    在下面一位回答者的帮助下,我确实解决了我的问题,这就是我想出来的方案。当然,可能还有更好的方式,欢迎随时告诉我。

    基本布局

    // Stage 1: request every category page in parallel.
    Promise.all(categoriesArr.map(categoryObj => new Promise((resolve, reject)=>{
            request.get(baseUrl + categoryObj.categoryItemLink, (error, response, html)=>{
                if(error){
                    return reject(error);
                }

                //build an array of ALL the categoryItemLinks

                // resolve() accepts exactly ONE value; any extra argument is dropped.
                return resolve(html);
            });
    }))).then(function(statesArray) {

            // Stage 2: request every categoryItem page in parallel.
            // Returning this promise makes the next .then() wait for it and lets a
            // single .catch() at the end cover both stages.
            return Promise.all(allCategoryItems.map(categoryItemObject => new Promise((resolve, reject)=>{
                request.get(baseUrl + categoryItemObject.categoryItemPageLink, (error, response, html)=>{
                    if(error){
                        return reject(error);
                    }
                    // Gather Data and put into dataJson

                    return resolve(html);
                });
            })));

    }).then(function(data) {

            // Runs only after ALL categoryItem pages have been fetched.
            // Do finishing stuff

    }).catch(function(error) {

            // Reached when any request in either stage fails.
            console.error(error);

    });
    

    server.js

    var express = require('express');
    var fs = require('fs');
    var request = require('request');
    var cheerio = require('cheerio');
    var app = express();
    
    /**
     * Fetch a page and resolve with its HTML body.
     *
     * Adapts the callback-style `request.get` to a Promise so that several
     * fetches can be combined with `Promise.all`.
     *
     * @param {string} url - Absolute URL to fetch.
     * @returns {Promise<string>} Resolves with the response body; rejects with
     *     the error reported by `request`.
     */
    function fetchHtml(url) {
        return new Promise(function (resolve, reject) {
            request.get(url, function (error, response, html) {
                if (error) {
                    return reject(error);
                }
                // A promise settles with exactly one value, so pass the body only.
                return resolve(html);
            });
        });
    }

    /**
     * GET /scrape — crawl all categories and their categoryItem pages.
     *
     * 1. Fetch the front page and collect every category link.
     * 2. Fetch all category pages in parallel and collect every categoryItem link.
     * 3. Fetch all categoryItem pages in parallel and gather their data.
     *
     * Responds with the gathered `dataJson` once every request has finished, or
     * with HTTP 500 if any request fails.
     */
    app.get('/scrape', function (req, res) {

        // Declared locally so concurrent requests to this route do not share state.
        const baseUrl = 'http://www.blahblah.org';
        const dataJson = {}; // Holds all the gathered data

        fetchHtml(baseUrl).then(function (html) {

            const $ = cheerio.load(html);
            const categoriesArr = [];

            $('#categorySelector').each(function () {
                const categoryItemLink = $(this).find('a').attr('href');

                // Skip entries without a link; otherwise the URL built below
                // would end in the literal text "undefined".
                if (categoryItemLink) {
                    categoriesArr.push({
                        "categoryItemLink": categoryItemLink
                    });
                }
            });

            // Each category resolves with its own list of items, so the combined
            // order follows the page order rather than response arrival order.
            return Promise.all(categoriesArr.map(function (categoryObj) {
                return fetchHtml(baseUrl + categoryObj.categoryItemLink).then(function (html) {

                    const $ = cheerio.load(html);
                    const categoryItems = [];

                    $('#categoryItemSelector').each(function () {
                        const categoryItemPageLink = $(this).find('a').attr('href');

                        // Truthiness rejects undefined, null and "" in one check.
                        if (categoryItemPageLink) {
                            categoryItems.push({
                                "categoryItemPageLink": categoryItemPageLink
                            });
                        }
                    });

                    return categoryItems;
                });
            }));

        }).then(function (itemsPerCategory) {

            // Flatten [[items of category 1], [items of category 2], ...]
            const allCategoryItems = itemsPerCategory.reduce(function (all, items) {
                return all.concat(items);
            }, []);

            // Returned so the next .then() waits for every categoryItem page.
            return Promise.all(allCategoryItems.map(function (categoryItemObject) {
                return fetchHtml(baseUrl + categoryItemObject.categoryItemPageLink).then(function (html) {
                    const $ = cheerio.load(html);
                    // Gather Data and put into dataJson
                });
            }));

        }).then(function () {

            // Do finishing stuff

            // Every page has been processed: answer the HTTP request.
            res.send(dataJson);

        }).catch(function (error) {

            // One handler covers a failure in any of the three stages.
            console.error('Scrape failed:', error);
            if (!res.headersSent) {
                res.status(500).send({ error: String(error && error.message ? error.message : error) });
            }

        });

    })//END app.get()
    
    // Start the HTTP server and announce the port it listens on.
    app.listen('8081');
    console.log('Magic happens on port 8081');

    // Expose the app through both CommonJS export aliases.
    module.exports = app;
    exports = module.exports;
    

1 个答案:

答案 0 :(得分:0)

您可以使用 Promise.all,例如:

    Promise.all(urls.map(url => new Promise((resolve, reject)=>{ request.get(url, (err, res, html)=>{ if(err){ return reject(err); } return resolve(res, html); }); }))).then(/*success*/).catch(/*error*/);

在该代码中,.then() 在所有请求都带回响应后执行。

(注意:resolve 只接受一个参数,因此上面的 html 会被丢弃;如果需要同时拿到两者,可以改为 resolve({ res, html })。)