尝试抓取网页时提示超出最大重定向次数?

时间:2017-04-01 22:48:47

标签: javascript node.js web-scraping

我正在尝试在Node.js中抓取一个网页。

var request = require('request').defaults({maxRedirects:3});


let url = "https://webapp4.asu.edu/catalog/classlist?k=81684&t=2177&e=all&hon=F&promod=F"
// , qs:propertiesObject
request({url:url}, function(err, response, body) {
  if(err) { console.log(err); return; }
  console.log("Get response: " + response.statusCode);
});

由于某种原因,我得到了"超出最大重定向次数"的错误。我可以在 Postman 中执行同样的 GET 请求并正常访问该页面。我该怎么做才能让重定向正常完成?

1 个答案:

答案 0 :(得分:1)

最佳实践:在抓取网页之前,您应该始终先检查网站的 robots.txt 文件。我没能找到这个网站的 robots.txt,但如果您遇到不允许抓取的网站,就应该遵守其中的所有规则。

话虽如此,由于发出的请求中缺少某些请求头(headers),您的爬虫似乎陷入了无限重定向循环。

下面的代码可以让您拿到响应,但您还需要自行确定如何解析响应内容,才能从中提取所需的信息。

var request = require('request');

/**
 * Request options that mimic a browser visit to the class-list page.
 *
 * The headers were captured from a Chrome session. The HTTP/2 pseudo-headers
 * (`:method`, `:path`, `:scheme`) that browser dev tools display are not real
 * request headers, so they are intentionally not sent here; the method and
 * path are derived from `url` instead.
 */
const options = {
  url: 'https://webapp4.asu.edu/catalog/classlist?k=81684&t=2177&e=all&hon=F&promod=F',
  // Have `request` decode a gzip/deflate-compressed body before it reaches the
  // callback; without this a compressed page is handed back as raw bytes.
  gzip: true,
  headers: {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    // Advertise only the encodings that `gzip: true` is documented to decode.
    "accept-encoding": "gzip, deflate",
    "accept-language": "en-US,en;q=0.8",
    "cache-control": "no-cache",
    // NOTE(review): this is a captured browser session cookie and will presumably
    // expire; consider request's cookie jar (`jar: true`) for long-lived use.
    "cookie": "JSESSIONID=javaprod19~413DF4150236B1466C8ECB85EB796C06.catalog19; onlineCampusSelection=C; __cfduid=d5e9cb96f2485f7500fec2116ee8f23381491087061; __utma=59190898.1874896314.1491088625.1491088625.1491088625.1; __utmb=59190898.2.10.1491088625; __utmc=59190898; __utmz=59190898.1491088625.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=137925942.2000995260.1491087063.1491087063.1491088718.2; __utmb=137925942.2.10.1491088718; __utmc=137925942; __utmz=137925942.1491088718.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ADRUM=s=1491089349546&r=https%3A%2F%2Fwebapp4.asu.edu%2Fcatalog%2Fclasslist%3F-1275642430",
    "pragma": "no-cache",
    "referer": "https://webapp4.asu.edu/catalog/",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
  },
};

/**
 * Completion handler for the page request.
 *
 * @param {?Error} error - Set when the request failed (for example when the
 *   redirect limit was exceeded); `response` is not available in that case.
 * @param {object} response - Response object; its `body` holds the page content.
 * @param {string} body - Response body (unused; `response.body` is printed).
 */
function callback(error, response, body) {
    // Report the failure instead of dereferencing an undefined `response`,
    // which would throw a TypeError and hide the real error.
    if (error) {
        console.error('Request failed:', error);
        return;
    }

    console.log(response.body);
}
// Issue the request described by `options` and hand the outcome to `callback`.
request(options, callback);