Nodejs:比较两个异步请求的结果

时间:2016-11-23 05:01:55

标签: node.js asynchronous request web-crawler

我查看了有关此主题的其他问题,但在这种情况下无法解决如何实现它的问题。

我想要实现的目标:

  1. 访问网站并获取内容(正文)
  2. 访问匹配的测试网站并获取内容(正文)
  3. 比较内容
  4. 第1页的抓取链接
  5. 第2页的抓取链接
  6. 继续
  7. 我目前遇到的问题是我无法比较内容,因为请求不是在等待彼此。 这就是我的代码目前的样子。

    require('colors');
    var request = require('request');
    var cheerio = require('cheerio');
    var jsdiff = require('diff');
    var URL = require('url-parse');
    
    var PROD_START_URL = "https://www.somesite.org";
    var MAX_PAGES_TO_VISIT = 100;
    
    var pagesVisited = {};
    var numPagesVisited = 0;
    var pagesToVisit = [];
    
    var globalProdContent;
    var globalTestContent;
    
    var url = new URL(PROD_START_URL);
    var baseUrl = url.protocol + "//" + url.hostname;
    
    pagesToVisit.push(PROD_START_URL);
    crawl();
    
    function crawl() {
      if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
        console.log("Reached max limit of number of pages to visit.");
        return;
      }
      var nextPage = pagesToVisit.pop();
      if (nextPage in pagesVisited) {
        // We've already visited this page, so repeat the crawl
        crawl();
      } else {
        // New page we haven't visited
        visitPage(nextPage, crawl);
      }
    }
    
    function visitPage(url, callback) {
      // Add page to our set
      pagesVisited[url] = true;
      numPagesVisited++;
    
      // Make the request
      console.log("Visiting page " + url);
      request(url, function(error, response, body) {
         // Check status code (200 is HTTP OK)
         console.log("Status code: " + response.statusCode);
         if(response.statusCode !== 200) {
           callback();
           return;
         }
         // Parse the document body
         var $ = cheerio.load(body);
         globalProdContent = $("#wrapper").text();
    
         // Build new URL for test site
         var testURL = url.replace("https://www.somesite.org", "http://matching.testsite");
    
         // Scrape test site
         scrapeTestContent(testURL);
    
    
         collectInternalLinks($);
         callback();
      });
    }
    
    function collectInternalLinks($) {
        var relativeLinks = [];
        relativeLinks = $("a[href]");
    
        console.log("Found " + relativeLinks.length + " relative links on page");
        relativeLinks.each(function() {
            pagesToVisit.push(baseUrl + "/" + $(this).attr('href'));
        });
    }
    
    function scrapeTestContent(testURL) {
        console.log("Visiting matching testpage " + testURL);
        request(testURL, function(error, response, body) {
            console.log("Status code: " + response.statusCode);
            if(response.statusCode !== 200) {
                callback();
            return;
            }
    
            var $ = cheerio.load(body);
            globalTestContent = $("#wrapper").text();
            console.log(globalTestContent);
    
        });
    }
    

    是否有更简单的方法可以做到这一点,还是我完全偏离轨道?

1 个答案:

答案 0 :(得分:0)

这可以通过两种方式完成:  1.添加回调到scrapeTestContent

    function scrapeTestContent(testURL, cb) {
        ...
        request(testURL, function(error, response, body) {
            cb();
        });

    In visitPage,

    function visitPage(url, callback) {
        ...
        scrapeTestContent(testURL, () => collectInternalLinks($));
    }
  1. 使用ES6承诺。在scrapeTestContent()返回new Promise((resolve, reject) => {}。然后在visitPage中,使用以下构造:scrapeTestContent(testUrl).then(() => collectInternalLinks($))