无法抓取ally.com

时间:2017-03-18 05:46:44

标签: javascript node.js node-modules

我能够抓取像nature.com,flipkart.com这样的网站。它运作得很好。但是当我试图抓住ally.com时,nike.com。它返回状态代码403并表示未定义。这是我的代码

// crawlerqueue.js

var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var pa11y=require('pa11y');

var START_URL = "http://www.nature.com/";
//var SEARCH_WORD = "stemming";
var MAX_PAGES_TO_VISIT = 100;

var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;

pagesToVisit.push(START_URL);
crawl();

function crawl() {
  if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
    console.log("Reached max limit of number of pages to visit.");
    return;
  }
  var nextPage = pagesToVisit.pop();
  if (nextPage in pagesVisited) {
    // We've already visited this page, so repeat the crawl
    crawl();
  } else {
    // New page we haven't visited
    visitPage(nextPage, crawl);
  }
}

function visitPage(url, callback) {
  // Add page to our set
  pagesVisited[url] = true;
  numPagesVisited++;

  // Make the request
  console.log("Visiting page " + url);
  request(url, function(error, response, body) {
     // Check status code (200 is HTTP OK)
     console.log("Status code: " + response.statusCode);
     if(response.statusCode !== 200) {
       callback();
       return;
     }
     // Parse the document body
     var $ = cheerio.load(body);
     /*var isWordFound = searchForWord($, SEARCH_WORD);
     if(isWordFound) {
       console.log('Word ' + SEARCH_WORD + ' found at page ' + url);
     } else*/ {
       collectInternalLinks($);
       // In this short program, our callback is just calling crawl()
       callback();
     }
  });
}

function searchForWord($, word) {
  var bodyText = $('html > body').text().toLowerCase();
  return(bodyText.indexOf(word.toLowerCase()) !== -1);
}

function collectInternalLinks($) {
    var relativeLinks = $("a[href^='/']");
    console.log("Found " + relativeLinks.length + " relative links on page");
    relativeLinks.each(function() {
        pagesToVisit.push(baseUrl + $(this).attr('href'));
    });
}

我通过命令行运行此代码。 nature.com的输出如下:

Visiting page http://www.nature.com/
Status code: 200
Found 23 relative links on page
Visiting page http://www.nature.com/scitable/sponsors
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/pressnews
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/contact
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/about
Status code: 200
Found 25 relative links on page
Visiting page http://www.nature.com/scitable/my-profile/social-settings
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/photocredit
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/presscontact
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/presskit
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/pressroom
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/sponsorship
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/topicpage/copy-number-
Status code: 200
Found 89 relative links on page
Reached max limit of number of pages to visit.

但是当我尝试抓取nike.com或ally.com时,我会看到以下错误

Visiting page http://www.ally.com
Status code: 403
Visiting page undefined
C:\Users\dashboard-master\node_modules\request\index.js:45
    throw new Error('undefined is not a valid uri or options object.')
    ^

Error: undefined is not a valid uri or options object.
    at request (C:\Users\dashboard-master\node_modules\request\
index.js:45:11)
    at visitPage (C:\Users\dashboard-master\config\crawlqueue.j
s:41:3)
    at crawl (C:\Users\dashboard-master\config\crawlqueue.js:30
:5)
    at Request._callback (C:\Users\dashboard-master\config\crawlqueue.js:45:8)
    at Request.self.callback (C:\Users\dashboard-master\node_modules\request\request.js:188:22)
    at emitTwo (events.js:106:13)
    at Request.emit (events.js:191:7)
    at Request.<anonymous> (C:\Users\dashboard-master\node_modules\request\request.js:1171:10)
    at emitOne (events.js:96:13)
    at Request.emit (events.js:188:7)

1 个答案:

答案 0 :(得分:1)

  

它返回状态代码403

ally.com支持Akamai Ghost服务器,Akamai会以某种方式阻止抓取并提供错误参考。您可以在响应正文中检查此内容或在 X-Reference-Error 对我来说,它看起来像 18.5fcxx917.148981xxxx.dacxsd6 。如果您想深入挖掘,可以查看其API以翻译错误参考here

  

并说未定义

首先在进行请求调用时检查错误。您正在直接检查response.statusCode您不知道您是否收到回复或未定义的值。

在您的情况下,如果您没有crawl成功,则表示您没有 nextPages 进行抓取

200

这里你弹出一个空数组(pagesToVisit是空的,因为你没有收集任何链接)所以var nextPage = pagesToVisit.pop(); 将是 nextPage ,然后你传递相同的as undefined请求模块使请求模块抛出错误。

当你有一个长度为&gt;的数组时,你可以做的就是弹出0或检查nextPage值,如下

uri