我能够抓取像nature.com,flipkart.com这样的网站。它运作得很好。但是当我试图抓住ally.com时,nike.com。它返回状态代码403并表示未定义。这是我的代码
// crawlerqueue.js
var request = require('request');
var cheerio = require('cheerio');
var URL = require('url-parse');
var pa11y=require('pa11y');
var START_URL = "http://www.nature.com/";
//var SEARCH_WORD = "stemming";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var url = new URL(START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(START_URL);
crawl();
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the document body
var $ = cheerio.load(body);
/*var isWordFound = searchForWord($, SEARCH_WORD);
if(isWordFound) {
console.log('Word ' + SEARCH_WORD + ' found at page ' + url);
} else*/ {
collectInternalLinks($);
// In this short program, our callback is just calling crawl()
callback();
}
});
}
function searchForWord($, word) {
var bodyText = $('html > body').text().toLowerCase();
return(bodyText.indexOf(word.toLowerCase()) !== -1);
}
function collectInternalLinks($) {
var relativeLinks = $("a[href^='/']");
console.log("Found " + relativeLinks.length + " relative links on page");
relativeLinks.each(function() {
pagesToVisit.push(baseUrl + $(this).attr('href'));
});
}
我通过命令行运行此代码。 nature.com的输出如下:
Visiting page http://www.nature.com/
Status code: 200
Found 23 relative links on page
Visiting page http://www.nature.com/scitable/sponsors
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/pressnews
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/contact
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/about
Status code: 200
Found 25 relative links on page
Visiting page http://www.nature.com/scitable/my-profile/social-settings
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/photocredit
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/presscontact
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/presskit
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/pressroom
Status code: 200
Found 26 relative links on page
Visiting page http://www.nature.com/scitable/sponsorship
Status code: 200
Found 22 relative links on page
Visiting page http://www.nature.com/scitable/topicpage/copy-number-
Status code: 200
Found 89 relative links on page
Reached max limit of number of pages to visit.
但是当我尝试抓取nike.com或ally.com时,我会看到以下错误
Visiting page http://www.ally.com
Status code: 403
Visiting page undefined
C:\Users\dashboard-master\node_modules\request\index.js:45
throw new Error('undefined is not a valid uri or options object.')
^
Error: undefined is not a valid uri or options object.
at request (C:\Users\dashboard-master\node_modules\request\
index.js:45:11)
at visitPage (C:\Users\dashboard-master\config\crawlqueue.j
s:41:3)
at crawl (C:\Users\dashboard-master\config\crawlqueue.js:30
:5)
at Request._callback (C:\Users\dashboard-master\config\crawlqueue.js:45:8)
at Request.self.callback (C:\Users\dashboard-master\node_modules\request\request.js:188:22)
at emitTwo (events.js:106:13)
at Request.emit (events.js:191:7)
at Request.<anonymous> (C:\Users\dashboard-master\node_modules\request\request.js:1171:10)
at emitOne (events.js:96:13)
at Request.emit (events.js:188:7)
答案 0 :(得分:1)
它返回状态代码403
ally.com
支持Akamai Ghost服务器,Akamai会以某种方式阻止抓取并提供错误参考。您可以在响应正文中检查此内容或在 X-Reference-Error 对我来说,它看起来像 18.5fcxx917.148981xxxx.dacxsd6 。如果您想深入挖掘,可以查看其API以翻译错误参考here。
并说未定义
首先在进行请求调用时检查错误。您正在直接检查response.statusCode
您不知道您是否收到回复或未定义的值。
在您的情况下,如果您没有crawl
成功,则表示您没有 nextPages 进行抓取
200
这里你弹出一个空数组(pagesToVisit是空的,因为你没有收集任何链接)所以var nextPage = pagesToVisit.pop();
将是 nextPage
,然后你传递相同的as undefined
请求模块使请求模块抛出错误。
当你有一个长度为&gt;的数组时,你可以做的就是弹出0或检查nextPage值,如下
uri