Question

我正在使用 jsdom（一个用于 Node.js 的网页抓取库）发起 1 到 10 个网络请求，代码大致如下：

// Route handler from the question: starts three independent jsdom scrapes
// and then renders the results page.
app.get('/results', function(req, res) {

  jsdom.env(
    "http://website1.com",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // scrape website #1
    }
  );

  jsdom.env(
    "http://website2.com",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // scrape website #2
    }
  );

  jsdom.env(
    "http://website3.com",
    ["http://code.jquery.com/jquery.js"],
    function (errors, window) {
      // scrape website #3
    }
  );

  // PROBLEM (the subject of the question): the jsdom callbacks above are
  // asynchronous, so this line runs before any of them has completed.
  // NOTE(review): `items` is not defined in this snippet; presumably it is
  // meant to be filled in by the scrape callbacks.
  res.render('results', { items: items });
});

如何在所有 jsdom 请求都完成、并且所需的信息全部收集完毕之后，再运行 res.render()？在同步的世界里这显然不成问题，但由于 JavaScript 是异步的，res.render() 会在任何一个 jsdom 回调完成之前就运行。

Answer 1

天真的解决方案

对于少量抓取，可以使用的“天真”解决方案是把所有操作嵌套起来（在上一次抓取的回调中开始下一次抓取，最后一个回调中包含渲染方法）。

scrape
  cb: scrape
     cb: scrape
        cb: render all results

当然，这样写既繁琐又难以阅读。（而且所有请求都是串行而不是并行执行的，速度不会很快。）

更好的解决方案

更好的解决方案是编写一个函数来统计已返回结果的数量，并在所有结果都返回后调用 render。下面是一个实现：

/**
 * Builds a collector callback for a fixed number of parallel operations.
 *
 * Each operation calls the returned function once with its result. When the
 * collector has been called `total` times, `finalCallback` is invoked once
 * with an array of all results, in the order they arrived (completion order,
 * not start order).
 *
 * @param {number} total - Number of results to wait for; must be a
 *     non-negative integer (numeric strings are accepted).
 * @param {function(Array)} finalCallback - Invoked once with the collected
 *     results. If `total` is 0 it is invoked immediately with an empty array.
 * @returns {function(*)} Collector to call with each individual result.
 * @throws {TypeError} If `total` is not a non-negative integer. Such a count
 *     could never be reached, so `finalCallback` would silently never run.
 */
function parallel_cb(total, finalCallback) {
    var expected = Number(total);
    if (total == null || !isFinite(expected) || expected < 0 ||
            Math.floor(expected) !== expected) {
        throw new TypeError(
            'parallel_cb: total must be a non-negative integer, got ' + total);
    }

    var done = 0;
    var results = [];

    // Nothing to wait for: without this, the final callback would never fire.
    if (expected === 0) finalCallback(results);

    return function(result) {
        // Ignore calls after completion so the array already handed to
        // finalCallback is not mutated afterwards.
        if (done >= expected) return;
        done += 1;
        results.push(result);
        if (done === expected) finalCallback(results);
    };
}

要在您的示例中使用它：

// Example route using parallel_cb: starts three scrapes in parallel and
// renders the page once all three have reported a result.
app.get('/results', function(req, res) {
    // The count must equal the number of scrapes started below. If it is
    // larger, the final callback never fires and no response is sent; if it
    // is smaller, render runs before all results are in. When the URLs live
    // in an array, pass that array's `.length` here instead.
    var myCallback = parallel_cb(
        3,
        function(items) { res.render('results', { items: items }); });

    jsdom.env(
      "http://nodejs.org/dist/",
      ["http://code.jquery.com/jquery.js"],
      function (errors, window) {
        // do some scraping
        // `result_from_scrape` is a placeholder for whatever was scraped.
        myCallback(result_from_scrape);
      }
    );

    jsdom.env(
      "http://nodejs.org/dist/",
      ["http://code.jquery.com/jquery.js"],
      function (errors, window) {
        // more scraping
        myCallback(result_from_scrape);
      }
    );

    jsdom.env(
      "http://nodejs.org/dist/",
      ["http://code.jquery.com/jquery.js"],
      function (errors, window) {
        // even more scraping
        myCallback(result_from_scrape);
      }
    );
});

最佳解决方案

与其自己编写，不如学习使用现有的并行/异步库，正如 @almypal 在问题的评论中所建议的那样。

使用async，您可以按照文档中的说明做一些更整洁的事情：https://github.com/caolan/async#parallel

或者，如果您的所有抓取实际上都是在结果页面中查找相同的元素，您甚至可以对 URL 数组执行并行映射（map）来进行抓取：https://github.com/caolan/async#maparr-iterator-callback

每次抓取都可以使用 async 的 parallel 方法提供的回调函数来返回其抓取结果。最终的[可选]回调中将包含您用所有项目调用 render 的代码。

编辑：您要求的示例

这是您的代码，直接翻译为async库：

var async = require("async");

// Route handler using async.parallel: runs three jsdom scrapes in parallel
// and renders once all have finished, or renders an error page if one fails.
app.get('/results', function(req, res) {
    async.parallel( // the first argument is an array of functions
      [
        // this cb (callback) is what you use to let the async
        // function know that you're done, and give it your result
        function (cb) {
          jsdom.env(
            "http://nodejs.org/dist/",
            ["http://code.jquery.com/jquery.js"],
            function (errors, window) {
              // Pass a jsdom failure on to async instead of dropping it;
              // otherwise the error branch of the final callback can never run.
              if (errors) return cb(errors);

              // do some scraping

              // async's callback expects an error for the first
              // param and the result as the second param
              cb(null, result_from_scrape); //No error
            }
          );
        },
        function (cb) {
          jsdom.env(
            "http://nodejs.org/dist/",
            ["http://code.jquery.com/jquery.js"],
            function (errors, window) {
              if (errors) return cb(errors);

              // more scraping
              cb(null, result_from_scrape);
            }
          );
        },
        function (cb) {
          jsdom.env(
            "http://nodejs.org/dist/",
            ["http://code.jquery.com/jquery.js"],
            function (errors, window) {
              if (errors) return cb(errors);

              // even more scraping
              cb(null, result_from_scrape);
            }
          );
        }
      ],
      // This is the "optional callback". We need it to render.
      function (err, results) {
        // If any of the parallel calls returned an error instead
        // of null, it's now in the err variable.
        if (err) {
          res.render('error_template', {error: err});
          return;
        }
        res.render('results', { items: results });
      });
});

如何在Node.js中完成所有网络抓取请求后呈现页面？

1 个答案:

天真的解决方案

更好的解决方案

最佳解决方案

编辑：您要求的示例