在NodeJS中使用嵌套回调时遇到麻烦

时间:2015-10-29 22:28:33

标签: javascript node.js callback cheerio

我正在编写一个程序,先从某个网站抓取链接,然后再抓取这些链接对应的页面以获取信息。要抓取该网站,必须先登录。所以顺序是:登录 -> 抓取索引页上的链接 -> 抓取各个链接的信息

传给login函数的回调打印出的结果里数组是空的:{ results: [], hasMore: true },所以我的代码有问题(抓取部分本身是有效的):

// HTTP client; re-bound on the next line to a copy created with the jar option.
var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
var cheerio = require('cheerio');

// Index page that is scraped for links, and the URL the login form is posted to.
var url1 = "https://example.org/torrents/browse/index/";
var loginUrl = "https://example.org/user/account/login/";

// Form fields sent as the body of the login POST.
var credentials = {
    username: 'user1',
    password: 'passpass'
};

// Entry point: run the login -> index -> details chain and print the final result.
login(function (result) {
    console.log(result);
});

/**
 * Posts the login form and, if the request succeeds, starts scraping the
 * index page at `url1`.
 *
 * @param {Function} callback - Receives the result produced by
 *   `scrapeTorrents`. It is not called when the login request fails; the
 *   failure is only logged.
 */
function login(callback) {
    var loginRequest = {
        uri: loginUrl,
        headers: { 'content-type': 'application/x-www-form-urlencoded' },
        body: require('querystring').stringify(credentials)
    };

    // Relays the scrape result to the caller.
    function forwardResult(result) {
        callback(result);
    }

    // Runs once the login POST has completed.
    function onLoginResponse(err, res, body) {
        if (!err) {
            scrapeTorrents(url1, forwardResult);
            return;
        }
        console.log("Login error");
    }

    request.post(loginRequest, onLoginResponse);
}

/**
 * Downloads the index page at `url`, collects the href of the first child of
 * every `span.title` element, and passes those links to
 * `scrapeTorrentDetails`.
 *
 * @param {string} url - Address of the index page.
 * @param {Function} callback - Receives whatever `scrapeTorrentDetails`
 *   reports. It is not called when the index request fails; the failure is
 *   only logged.
 */
function scrapeTorrents(url, callback) {
    // Extracts the detail-page links from the index page markup.
    function collectLinks(html) {
        var $ = cheerio.load(html);
        var hrefs = [];
        $('span.title').each(function () {
            hrefs.push($(this).children().eq(0).attr('href'));
        });
        return hrefs;
    }

    request(url, function (err, res, body) {
        if (err) {
            console.log("Main scrape error");
            return;
        }
        scrapeTorrentDetails(collectLinks(body), function (result) {
            callback(result);
        });
    });
}

/**
 * Requests each detail page in `links` and builds one movie record per page.
 *
 * NOTE(review): `callback` is invoked right after the loop that starts the
 * requests, without waiting for any request callback to push into `results`.
 * This matches the empty `results` array reported in the question.
 *
 * @param {Array} links - Detail-page paths; each is appended to
 *   "https://example.org" before being requested.
 * @param {Function} callback - Called once with `{ results, hasMore: true }`.
 */
function scrapeTorrentDetails(links, callback) {
    var results = [];

    // Fetches one detail page and, on success, appends a movie record to
    // `results`. It takes no completion callback, so the caller cannot tell
    // when the request has finished.
    function getDetails(url) {
        request(url, function(err, res, body) {
                if(err) {
                    console.log("Detail scrape error");
                    return;
                }
                console.log("Scraping: " + url);
                var $ = cheerio.load(body);
                // Fields are picked out by <td> position in the page.
                var tds = $('td');
                var title = $(tds).get(1).firstChild.data;
                var hash = $(tds).get(3).firstChild.data.trim();
                var size = $(tds).get(9).firstChild.data;
                //  console.log(tds.length);
                // Only pages with more than 23 cells are read for the extra
                // metadata; shorter pages get "notfound" placeholders.
                if (tds.length > 23) {
                    var rlsDate = $(tds).get(23).firstChild.data || '';;
                    var genres = $(tds).get(27).firstChild.data || '';;
                    var runtime = $(tds).get(31).firstChild.data || '';;
                    // `plot` stays undefined when cell 33 has no child node.
                    if ( $(tds).get(33).firstChild != null) {
                        var plot = $(tds).get(33).firstChild.data || '';;
                    }
                    var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
                    var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
                    var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
                    var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
                    // NOTE(review): `cover` was defaulted with `|| ''` above, so
                    // it is never undefined here and this fallback never runs.
                    if (typeof cover == 'undefined') {
                        cover = thumb;
                    }
                } else {
                    var rlsDate = "notfound";
                    var genres = "notfound";
                    var runtime = "notfound";
                    var plot = "notfound";
                    var rating = "notfound"; // of 10
                    var imdb_id = "notfound";
                    var cover = "notfound";
                    var thumb = "notfound";
                }

                var movie = {
                    type: 'movie',
                    imdb_id: imdb_id,
                    title: title,
                    year: rlsDate,
                    genre: genres,
                    rating: rating,
                    runtime: runtime,
                    image: thumb,
                    cover: cover,
                    synopsis: plot,
                    torrents: {
                        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                        filesize: size
                    }
                };

                results.push(movie);
            });
    }

    // Starts one request per link; nothing here waits for them to complete.
    for (var i=0; i<links.length; i++){
            getDetails("https://example.org" + links[i]);
    }

    // Runs as soon as the loop above returns.
    callback( {
        results: results,
        hasMore: true
    });
}

也许使用Q的promise会更好。我该如何在上面的代码中实现它?

如果你想知道代码的用途,我计划修改Popcorn-time以使用另一个torrent跟踪器(没有API)。

谢谢

2 个答案:

答案 0 :(得分:1)

主要问题在于此代码:

// Starts one getDetails() call per link without waiting for any of them.
for (var i=0; i<links.length; i++){
        getDetails("https://example.org" + links[i]);
}

// Runs as soon as the loop returns, so `results` has not been filled in yet.
callback( {
    results: results,
    hasMore: true
});

getDetails()是异步的,但您只是把它调用了links.length次就继续往下执行——就好像它们已经完成了一样。因此,在您调用回调并尝试传递结果时,getDetails()中的请求一个都还没有完成。此时结果尚未填入,所以results是空的。

您在代码的其他地方都(按需要)使用了嵌套回调,却偏偏在这一处疏忽了。在使用结果调用最终回调之前,您需要知道所有getDetails()调用何时完成。

此外,您还必须决定:是可以并行发起所有getDetails()调用(全部同时进行),还是真正想要的是调用一个、等待它完成、再调用下一个,依此类推。现在您是一次性把它们全部发出去,只要目标服务器不反对同时收到这么多请求,这样做是可行的。

有几种可能的解决方法。

  1. 为getDetails()添加一个回调,然后统计已收到的getDetails()回调次数,只有当该计数达到links.length(即全部完成)时,才调用最终的回调。

  2. 更改getDetails()以返回承诺。然后,您可以使用links.map(getDetails)之类的内容创建一系列承诺,然后您可以使用Promise.all()来了解它们何时完成。

  3. 就个人而言,我会更改您的所有代码以使用promises,并且我会使用Bluebird promises库来获取Promise.map()等额外功能,以使其更加简单。

    这是一个修补程序,它会向getDetails()添加一个回调,然后计算完成的数量:

    /**
     * Fetches every detail page in `links` and reports the parsed movie records
     * once all of the requests have finished.
     *
     * A page that fails to download or cannot be parsed is logged and skipped;
     * it still counts as finished, so `callback` fires exactly once even when
     * some pages fail or when `links` is empty.
     *
     * @param {Array} links - Detail-page paths; each one is appended to
     *   "https://example.org" before being requested.
     * @param {Function} callback - Receives `{ results, hasMore: true }`.
     *   Records are pushed as responses arrive, so their order need not match
     *   the order of `links`.
     */
    function scrapeTorrentDetails(links, callback) {
        var results = [];
        var doneCnt = 0;

        // Hands the collected records to the caller.
        function finish() {
            callback({
                results: results,
                hasMore: true
            });
        }

        // Builds a movie record from the HTML of one detail page. Fields are
        // picked out by <td> position, so unexpected markup makes this throw.
        function parseDetails(body) {
            var $ = cheerio.load(body);
            var tds = $('td');
            var title = $(tds).get(1).firstChild.data;
            var hash = $(tds).get(3).firstChild.data.trim();
            var size = $(tds).get(9).firstChild.data;

            // Placeholders used for pages without the extended metadata.
            var rlsDate = "notfound";
            var genres = "notfound";
            var runtime = "notfound";
            var plot = "notfound";
            var rating = "notfound"; // of 10
            var imdb_id = "notfound";
            var cover = "notfound";
            var thumb = "notfound";

            if (tds.length > 23) {
                rlsDate = $(tds).get(23).firstChild.data || '';
                genres = $(tds).get(27).firstChild.data || '';
                runtime = $(tds).get(31).firstChild.data || '';
                // An empty plot cell yields '' like the other fields rather
                // than leaving the synopsis undefined.
                plot = '';
                if ($(tds).get(33).firstChild != null) {
                    plot = $(tds).get(33).firstChild.data || '';
                }
                rating = $('#imdb_rating').parent().next().text() || ''; // of 10
                imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
                cover = $('#cover').children().eq(0).get(0).attribs.href || '';
                thumb = $('[alt=Cover]').get(0).attribs.src || '';
                // `cover` has already been defaulted to '', so test for the
                // empty string; a `typeof ... == 'undefined'` check never fires.
                if (cover === '') {
                    cover = thumb;
                }
            }

            return {
                type: 'movie',
                imdb_id: imdb_id,
                title: title,
                year: rlsDate,
                genre: genres,
                rating: rating,
                runtime: runtime,
                image: thumb,
                cover: cover,
                synopsis: plot,
                torrents: {
                    magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
                    filesize: size
                }
            };
        }

        // Fetches one page; `done` is called exactly once per page, with an
        // error when the download or the parsing failed.
        function getDetails(url, done) {
            request(url, function(err, res, body) {
                var movie;
                if (err) {
                    console.log("Detail scrape error");
                    done(err);
                    return;
                }
                console.log("Scraping: " + url);
                try {
                    movie = parseDetails(body);
                } catch (parseErr) {
                    // Without this, a malformed page would throw inside the
                    // request callback and `done` would never be reached.
                    console.log("Detail parse error: " + url + " (" + parseErr.message + ")");
                    done(parseErr);
                    return;
                }
                results.push(movie);
                done();
            });
        }

        // Counts finished pages (successful or not) and reports when all are in.
        function onDetailDone() {
            ++doneCnt;
            if (doneCnt === links.length) {
                finish();
            }
        }

        if (links.length === 0) {
            // No request will ever complete, so report the empty result here.
            // Deferred so the callback is asynchronous in every case.
            process.nextTick(finish);
            return;
        }

        for (var i = 0; i < links.length; i++) {
            getDetails("https://example.org" + links[i], onDetailDone);
        }
    }
    

答案 1 :(得分:0)

以下是把给定的示例代码改用bind重写的版本,它使用了自定义的this对象以及一个尚未完成的请求计数(我认为promise会掩盖执行路径)。

回调返回空数组的原因,似乎是文档中没有class为title的span元素,因此不会发起后续请求。

// Login -> index -> details scrape written as a chain of handlers that share
// one bound `this` object. That object carries `callback`, and later `results`
// and `resultCount` (the number of detail requests still outstanding).
var
  request = require('request').defaults({
    jar: true
  }), // necessary for persistent login
  cheerio = require('cheerio'),
  process = require('process'),

  url1 = "https://example.org/torrents/browse/index/",
  loginUrl = "https://example.org/user/account/login/",

  // Posts the login form; `callback` eventually receives
  // `{ results, hasMore: true }`.
  login = function(callback) {
    request.post({
      uri: loginUrl,
      headers: {
        'content-type': 'application/x-www-form-urlencoded'
      },
      body: require('querystring').stringify({
        username: 'user1',
        password: 'passpass'
      })
    }, fna.bind({
      callback: callback
    }));
  },

  // Login response handler: requests the index page.
  fna = function(err, res, body) {
    if (err) {
      console.log("Login error");
      return;
    }

    request(url1, fnb.bind(this));
  },

  // Index response handler: collects the detail links and requests each one.
  fnb = function(err, res, body) {
    if (err) {
      console.log("Main scrape error");
      return;
    }

    var
      $ = cheerio.load(body),
      links = [],
      detailUrl;

    $('span.title').each(function() {
      links.push($(this).children().first().attr('href'));
    });

    this.results = [];
    this.resultCount = links.length;

    if (this.resultCount === 0) {
      // Nothing to fetch: report the empty result on the next tick.
      process.nextTick(fne.bind(this));
      return;
    }

    for (var i = 0; i < links.length; i++) {
      detailUrl = "https://example.org" + links[i];
      // Bind the URL as the first argument so that fnc knows which page its
      // response belongs to; a single shared handler cannot tell.
      request(detailUrl, fnc.bind(this, detailUrl));
    }
  },

  // Builds a movie record from the HTML of one detail page. Fields are picked
  // out by <td> position in the page.
  parseDetails = function(body) {
    var
      $ = cheerio.load(body),
      tds = $('td'),
      title = $(tds).get(1).firstChild.data,
      hash = $(tds).get(3).firstChild.data.trim(),
      size = $(tds).get(9).firstChild.data,
      rlsDate = "notfound",
      genres = "notfound",
      runtime = "notfound",
      plot = "notfound",
      rating = "notfound", // of 10
      imdb_id = "notfound",
      cover = "notfound",
      thumb = "notfound";

    if (tds.length > 23) {
      rlsDate = $(tds).get(23).firstChild.data || '';
      genres = $(tds).get(27).firstChild.data || '';
      runtime = $(tds).get(31).firstChild.data || '';

      if ($(tds).get(33).firstChild != null) {
        plot = $(tds).get(33).firstChild.data || '';
      }

      rating = $('#imdb_rating').parent().next().text() || ''; // of 10
      imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
      cover = $('#cover').children().eq(0).get(0).attribs.href || '';
      thumb = $('[alt=Cover]').get(0).attribs.src || '';

      // `cover` has already been defaulted to '', so test for the empty
      // string; a `typeof ... == 'undefined'` check never fires.
      if (cover === '') {
        cover = thumb;
      }
    }

    return {
      type: 'movie',
      imdb_id: imdb_id,
      title: title,
      year: rlsDate,
      genre: genres,
      rating: rating,
      runtime: runtime,
      image: thumb,
      cover: cover,
      synopsis: plot,
      torrents: {
        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
        filesize: size
      }
    };
  },

  // Detail response handler. `url` is supplied through bind() in fnb.
  fnc = function(url, err, res, body) {
    if (err) {
      console.log("Detail scrape error");
    } else {
      console.log("Scraping: " + url);
      this.results.push(parseDetails(body));
    }

    // A failed request still counts as finished; otherwise one error would
    // keep resultCount above zero and the final callback would never fire.
    this.resultCount--;

    if (this.resultCount === 0) {
      fne.call(this);
    }
  },

  // Delivers the collected results to the caller.
  fne = function() {
    this.callback({
      results: this.results,
      hasMore: true
    });
  };

// Entry point: run the whole chain and print the final result.
login(function(result) {
  console.log(result);
});