对未知数量的URL发出异步GET请求

时间:2015-04-28 05:33:35

标签: node.js asynchronous web-scraping

发出未知数量的异步GET请求的最佳解决方案是什么?

例如,您有以下网址“http://www.example.org/”,并且您想要对以下路径发出一组异步请求:

[
    'http://www.example.org/A-1.html',
    'http://www.example.org/A-2.html',
    'http://www.example.org/A-3.html',
    '...', // placeholder: the remaining A-* pages
    'http://www.example.org/B-1.html',
    'http://www.example.org/B-2.html',
    '...' // placeholder: further pages, count unknown in advance
]

等等。例如,如果“... C-12.html”不存在,则会得到无效的响应状态码。您将如何发出这样一组并行请求?

3 个答案:

答案 0 :(得分:0)

    // Fire one GET per URL without waiting for the previous one, and collect
    // every successful response in outPutData.
    // Requires the third-party `request` package.
    var request = require('request');
    var vrlList = [
        'http://www.example.org/A-1.html',
        'http://www.example.org/A-2.html',
        'http://www.example.org/A-3.html',
        '...', // placeholder: the remaining A-* pages
        'http://www.example.org/B-1.html',
        'http://www.example.org/B-2.html',
        '...' // placeholder: further pages
    ];
    // Filled asynchronously, in completion order (not in vrlList order).
    var outPutData = [];
    vrlList.forEach(function (url) {
        request({ method: 'GET', uri: url }, function (error, response, body) {
            if (error) {
                // Transport-level failure (bad URI, DNS, connection reset, ...):
                // there is no body to record, so log it and skip this URL.
                console.log(error);
                return;
            }
            // statusCode lets the consumer discard pages that do not exist
            // (e.g. a 404 for a missing "C-12.html").
            outPutData.push({ "url": url, data: body, statusCode: response.statusCode });
        });
    });

答案 1 :(得分:0)

我认为这是一项算法任务。

假设你有A-Z(要处理26行)。 并且你最多只能并行运行15个线程。

所以你加载
<div id="fb-root"></div>
<span id='fbinfo'><fb:name uid='loggedinuser' useyou='false'></fb:name></span>
<!-- USE 'Asynchronous Loading' version, for IE8 to work
     http://developers.facebook.com/docs/reference/javascript/FB.init/
     <script type="text/javascript" src="<%=path%>/js/socialNetwork.js"></script>
-->
<script>
  // Access token and user id of the connected Facebook user.
  // Declared without an initializer so a value set by an earlier script is kept.
  var fbtoken;
  var fbuserid;

  // Stores the credentials carried by a login/status response.
  // Returns true when the response reports a connected user, false otherwise.
  function rememberSession(response) {
    if (response && response.status === 'connected') {
      fbtoken = response.authResponse.accessToken;
      fbuserid = response.authResponse.userID;
      return true;
    }
    return false;
  }

  // NOTE(review): `appid` is not defined in this snippet; it must be provided
  // by the page before this script runs.
  FB.init({appId: appid, status: true, cookie: true, xfbml: true});

  FB.Event.subscribe('auth.sessionChange', function(response) {
    rememberSession(response);
  });

  // Prompt for login only when there is no connected session yet.
  FB.getLoginStatus(function(response) {
    if (!rememberSession(response)) {
      loginFB();
    }
  });

  // Opens the Facebook login dialog, requesting the create_event permission.
  function loginFB() {
    FB.login(function(response) {
      rememberSession(response);
    }, {scope: 'create_event'});
  }

  // Logs the current user out of Facebook.
  function logoutFB() {
    FB.logout(function(response) {
      // user is now logged out
    });
  }

  // Posts a new event to /me/events using the stored access token and
  // reports the outcome with alert().
  function createEvent(name, startTime, endTime, location, description) {
    var eventData = {
      "access_token": fbtoken,
      "start_time": startTime,
      "end_time": endTime,
      "location": location,
      "name": name,
      "description": description,
      "privacy": "OPEN"
    };
    FB.api("/me/events", "post", eventData, function(response) {
      if (!response || response.error) {
        // A failed call carries no id; report the error itself instead.
        alert("Response " + (response && response.error ? JSON.stringify(response.error) : "empty"));
      } else {
        alert("Post ID: " + response.id);
      }
    });
  }

  // Creates a sample event with fixed data.
  function createMyEvent() {
    var name = "My Amazing Event";
    var startTime = "10/29/2015 12:00 PM";
    var endTime = "10/29/2015 06:00 PM";
    var location = "Dhaka";
    var description = "It will be freaking awesome";
    createEvent(name, startTime, endTime, location, description);
    alert(name);
  }
</script>
数组中的前15行:process; 然后在A-O上启动多线程进程。 例如,当process未定义(未定义)时,您只需抛出异常并在B-56数组中加载下一行process,同时排除行P

这样,您将遍历所有剩余的行B

答案 2 :(得分:0)

这是我使用jQuery的Deferred(延迟)对象想出的方案。

&#13;
&#13;
/**
 * Requests every URL in parallel with $.getJSON and invokes `callback` once,
 * after ALL requests have finished (successfully or not).
 *
 * @param {string[]} urls - URLs to fetch; may be empty.
 * @param {function(Array)} callback - Receives an array aligned with `urls`:
 *   the parsed JSON for each request that succeeded, `null` for each that failed.
 * @returns {Object} A promise resolved with the same array.
 */
function getUrls(urls, callback) {
  // One slot per URL so results keep the order of `urls`, not completion order.
  var result = urls.map(function() {
    return null;
  });
  var pending = urls.length;
  var g = $.Deferred()
    .done(callback);

  // Nothing to wait for: resolve right away, otherwise the callback never fires.
  if (pending === 0) {
    g.resolve(result);
    return g.promise();
  }

  urls.forEach(function(u, index) {
    $.getJSON(u, function(data) {
      // Do whatever processing you need to do here
      result[index] = data;
    }).always(function() {
      // Count failures too, so one missing page cannot stall the whole batch.
      pending -= 1;
      if (pending === 0) {
        g.resolve(result); // invoke the callback with the result
      }
    });
  });

  return g.promise();
}
&#13;
&#13;
&#13;

Deferred对象会一直等到最后一个请求完成后才被解决(resolve)。尽管forEach本身是同步(阻塞)执行的,但由于getJSON是异步的,循环会并行发出所有请求,并在它们全部完成后再解决Deferred对象。