Node.js:如何使用无限滚动分页来抓取网站?

时间:2019-04-28 16:59:11

标签: npm request infinite-scroll

我正在尝试抓取一个使用无限滚动分页的网站。到目前为止,我已经能够成功抓取“第一”页,但无法抓取其余页面。“paginationNumber”(即“页数”)的最大值并不固定,通常为 5-10。我想以某种方式遍历 paginationNumber,抓取所有页面的内容,并将它们全部合并到 siteBody 变量中,而不只是第一页的内容。有没有可以做到这一点的 npm 库?我一直没能想明白该怎么做。以下是我目前用来抓取第一页的代码:

  //THE HTTP REQUEST
  //define the callback functions for the http request

  /**
   * Final callback of the request chain: receives the body of the scraped
   * page, stores it in `siteBody` and then calls `newFileActions()`.
   *
   * `siteBody` and `newFileActions` are not declared in this snippet; they
   * are expected to exist in the surrounding script.
   *
   * @param {?Error} err - Error reported by `request`, or a falsy value on success.
   * @param {Object} httpResponse2 - Response object (unused here).
   * @param {string} body - Body of the fetched page.
   * @throws {Error} Rethrows `err` when the request failed.
   */
  var callback2 = function(err, httpResponse2, body){
    // Check the error first so a failed request never overwrites siteBody.
    if (err){throw err;}
    siteBody = body;
    newFileActions();
  };

  // Page index appended to urlString2 when the page URL is built.
  var paginationNumber=0;

  /**
   * Callback for the login POST: builds the options for the page request
   * (re-using the first cookie set by the login response) and issues it,
   * handing the result to `callback2`.
   *
   * @param {?Error} err - Error reported by `request`, or a falsy value on success.
   * @param {Object} httpResponse - Login response; its first `set-cookie`
   *   header is forwarded as the `Cookie` header.
   * @param {string} body - Body of the login response (unused here).
   * @param {number} [pageNumber] - Optional explicit page index. When omitted,
   *   the outer `paginationNumber` is used.
   * @throws {Error} Rethrows `err` (after logging it) when the request failed.
   */
  var callback1 = function(err, httpResponse, body, pageNumber){
    if (err){console.log(err); throw err;}

    // request invokes its callback with (err, response, body) only. A fourth
    // parameter named `paginationNumber` would shadow the outer variable and
    // always be undefined, producing a URL ending in "page=undefined".
    var page = (pageNumber === undefined) ? paginationNumber : pageNumber;

    // Keep only the "name=value" pair of the first Set-Cookie header.
    // split(";")[0] also works when the cookie carries no attributes, where
    // substr(0, indexOf(";")) would have returned an empty string.
    var sessionCookie = httpResponse.headers["set-cookie"][0].split(";")[0];

    var options2 = {
      //NOTE: urlString2 ends in "page=" so that the page index comes next
      url: urlString2+page
      ,gzip: true
      ,headers:{
        'Host': hostName
        ,'User-Agent': myUserAgent
        ,'Accept': myAccept
        ,'Accept-Language': myAcceptLanguage
        ,'Accept-Encoding': myAcceptEncoding
        ,'Referer': "https://"+hostName
        ,'Cookie': sessionCookie
        ,'Connection': "keep-alive"
        ,'Upgrade-Insecure-Requests': "1"
      }
    };
    request(options2, callback2);
  };
 /**
  * Callback for the initial GET: posts the login form to `urlString1`,
  * forwarding the first cookie set by the initial response, and hands the
  * login response to `callback1`.
  *
  * @param {?Error} err - Error reported by `request`, or a falsy value on success.
  * @param {Object} httpResponse0 - Initial response; its first `set-cookie`
  *   header is forwarded as the `Cookie` header.
  * @param {string} body - Body of the initial response (unused here).
  * @throws {Error} Rethrows `err` when the request failed.
  */
 var callback0 = function(err, httpResponse0, body){
    if (err){throw err;}

    // Keep only the "name=value" pair of the first Set-Cookie header.
    // split(";")[0] also works when the cookie carries no attributes, where
    // substr(0, indexOf(";")) would have returned an empty string.
    var sessionCookie = httpResponse0.headers["set-cookie"][0].split(";")[0];

    var options1 = {
      url: urlString1
      ,headers:{
        'Host': hostName
        ,'User-Agent': myUserAgent
        ,'Accept': myAccept
        ,'Accept-Language': myAcceptLanguage
        ,'Accept-Encoding': myAcceptEncoding
        ,'Referer': "https://"+hostName+'/login'
        ,'Cookie': sessionCookie
        ,'Content-Type': "application/x-www-form-urlencoded"
        // No manual Content-Length: it used to be computed from a separate
        // `postData` object rather than from the `form` actually sent, so the
        // two could disagree. request derives the length from the encoded
        // form body when the header is absent.
        ,'Connection': "keep-alive"
        ,'Upgrade-Insecure-Requests': "1"
        ,'DNT': "1"
        ,'TE': "Trailers"
      }
      ,form: {email: myEmail, password: myPassword}
    };
    request.post(options1, callback1);
 };
 // Options for the first request of the chain: a plain GET of urlString0
 // carrying browser-like headers. Its response is handled by callback0.
 var options0 = {};
 options0.url = urlString0;
 options0.headers = {
   'Host': hostName,
   'User-Agent': myUserAgent,
   'Accept': myAccept,
   'Accept-Language': myAcceptLanguage,
   'Accept-Encoding': myAcceptEncoding,
   'Referer': "https://" + hostName,
   'Connection': "keep-alive",
   'Upgrade-Insecure-Requests': "1"
 };

 // Start the chain: callback0 -> callback1 -> callback2.
 request(options0, callback0);

感谢您能提供的任何帮助

0 个答案:

没有答案