同时抓取多个页面的最佳策略是什么?

时间:2011-11-28 21:04:10

标签: node.js web-scraping express

我按照下面这个示例进行了操作:

http://net.tutsplus.com/tutorials/javascript-ajax/how-to-scrape-web-pages-with-node-js-and-jquery/comment-page-1/

并且效果很好。

但在我的情况下,当用户提交表单时,我需要抓取多个页面。

我的代码大致如下:

// Handles GET /lookup: starts one scrape per configured page and renders the
// 'list' view once every page has reported back through `callback`.
app.get('/lookup', function (req, res) {

var pagesToScrap = [];
var callbackCounter = 0;
var items = [];

// Invoked once by each resolver after its page has been processed.
// Renders the response only when the last outstanding page has finished.
var callback = function(){
    // Count this completion first, then compare against the number of pages
    // (the array's length, not the array itself).
    callbackCounter++;
    if(callbackCounter === pagesToScrap.length){
        res.render('list', {
            title: "Hello World",
            items: items
        });
    }
};

// Each resolver fetches one page, pushes its scraped data onto `items`
// and then signals completion.
// NOTE(review): `callback()` must also run on the error path, otherwise the
// response is never rendered when a single request fails.
var pageAResolver = function() {
    request.get({
            uri: 'http://a.com'
            //...
        }, function(error, response, body) {
            //... (page-specific parsing that produces jsonData)
        items.push(jsonData);
        callback();
    });
};
var pageBResolver = function() {
    request.get({
            uri: 'http://b.com'
            //...
        }, function(error, response, body) {
            //... (page-specific parsing that produces jsonData)
        items.push(jsonData);
        callback();
    });
};
var pageCResolver = function() {
    request.get({
            uri: 'http://c.com'
            //...
        }, function(error, response, body) {
            //... (page-specific parsing that produces jsonData)
        items.push(jsonData);
        callback();
    });
};
pagesToScrap[0] = {url: "http://a.com", resolver: pageAResolver};
pagesToScrap[1] = {url: "http://b.com", resolver: pageBResolver};
pagesToScrap[2] = {url: "http://c.com", resolver: pageCResolver};

// Kick off every request without waiting for the previous one to return.
for(var i = 0; i < pagesToScrap.length; i++){
    pagesToScrap[i].resolver();
}
});

只有当所有请求都返回后,我才会把响应发送给浏览器,有时这可能需要很长时间。在不使用缓存的情况下,要更快地显示这些数据,最佳策略是什么?

我正在考虑使用 socket.io,也许可以同时发出数据?大家怎么看?

干杯, Pablo Cantero

1 个答案:

答案 0 :(得分:1)

我建议使用https://github.com/caolan/async,一个很棒的异步工作流程库。

var async = require('async');

// Pages to scrape; each entry is handed to `request.get` as its options object.
var pagesToScrape = [
  { url: "http://a.com" },
  { url: "http://b.com" },
  { url: "http://c.com" }
];

// `async.map` will fire off all requests simultaneously 
// and collect the results for you:

async.map(pagesToScrape, function(opts, callback) {

  // `request` invokes its callback as (error, response, body),
  // so the response is the SECOND argument, not the first.
  request.get(opts, function(err, res) {
    if (err) {
      // Forward the failure so `done` receives it instead of
      // trying to parse a missing response.
      return callback(err);
    }

    // Do whatever analysis you need to get data from the page
    var jsonData = getJsonDataFrom(res);

    // Call callback when you've successfully scraped each page
    // The first parameter to callback is for "errors" (null if no error)
    // The second parameter is the jsonData object you want to use later.
    callback(null, jsonData);
  });
}, function done(err, items) {
  // This function gets called when all the pages are finished
  // (or as soon as one of them reports an error).
  if (err) {
    console.error(err);
    return;
  }

  // items contains the list of jsonData returned to callback
  console.log(items.length); // -> 3

  var jsonData0 = items[0];
  console.log(jsonData0); // -> {"somedata": "fromthepage", ...}
});

如果您要抓取大量页面,forEachLimit 可能更合适:它同样会异步处理整个页面列表,但同一时间最多只有 N 个页面的请求在进行。