我正在尝试运行一个.js文件,该文件应该允许我抓取每个html页面并收集网站的所有href属性。它仅适用于极少数网站(例如:flipkart.com)。无法抓取大多数网站,例如(ally.com,nature.com,nike.com)这是我的代码
crawler.js
'use strict';
var fs = require('fs');
var request = require('request');
var url = require('url');
var cheerio = require('cheerio');
var csv = require('csv-parser');
var json2csv = require('json2csv');
var pa11y =require('pa11y');
/**
* Crawler for microsites. Collects all internal URLs it can find starting from a
* base URL.
*
* queued: add if not exists
* crawled: add if crawled
* to pick URL to crawl, it loops through "queue", and skip ones found in "crawled"
* {
* queue: [
* URL1,
* URL2,
* URL3
* ],
* crawled: [
* URL1,
* URL2
* ]
* }
*/
function Crawler(baseUrl, abbreviation) {
this.queue = [];
this.crawled = [];
this.baseUrl = baseUrl;
this.abbreviation = abbreviation;
}
/**
* Triggers the crawling actions.
*/
Crawler.prototype.startCrawl = function(callback) {
var self = this;
this.cb = callback;
// Crawl base URL.
this.crawl(self.baseUrl);
};
/**
* Main crawler function.
*
* Grabs all URLs from links on a page, places them in the queue, and triggers
* crawlNext().
*/
Crawler.prototype.crawl = function(crawlUrl) {
var self = this;
// Crawl URL.
console.log('Crawl: ' + crawlUrl);
request(crawlUrl, function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var crawledUrl = '';
$('href').each(function(i, element) {
// Create absolute paths.
crawledUrl = url.resolve(crawlUrl, $(element).attr('href'));
// Filter internal URLs, non-hash URLs.**/
// You can add more restricions here if necessary.
if (crawledUrl.indexOf(crawlUrl) === 0 && crawledUrl.indexOf('#') === -1) {
// Push to queue if it isn't already in it.
if (self.queue.indexOf(crawledUrl) === -1) {
self.queue.push(crawledUrl);
}
}
});
// Add to crawled URLs list.
self.crawled.push(crawlUrl);
// Crawl next page.
self.crawlNext();
}
});
};
/**
* Crawl next URL from the queue.
*/
Crawler.prototype.crawlNext = function() {
var self = this;
var completed = true;
// Loop over queued URLs.
for (var i = 0; i < self.queue.length; i++) {
// If URL in queue hasn't been crawled yet.
if (self.crawled.indexOf(self.queue[i]) === -1 ) {
self.crawl(self.queue[i]);
completed = false;
break;
}
}
// Crawl completed.
if (completed === true) {
this.cb(self.crawled);
}
};
/**
* Create an iterator from an array.
*/
function makeIterator(array){
var nextIndex = 0;
return {
next: function() {
return nextIndex < array.length ? {value: array[nextIndex++], done: false} : {done: true};
}
};
}
/**
* Read all microsites and iterate the crawler function over it.
*
* The CSV file needs two columns: one named "url", containing the base URLs,
* the second one named "abbreviation", which represents a unique identifier
* for the URL.
*/
var microsites = [];
fs.createReadStream('C:/Users/microsites.csv')
.pipe(csv())
.on('data', function(data) {
// Collect URLs
microsites.push(data);
}).on('end', function () {
// Create iterator out of microsites array.
var iterator = makeIterator(microsites);
// Start crawling.
crawlMicrosite(iterator);
});
/**
* Checks the iterator and triggers a microsite crawl.
*/
function crawlMicrosite(iterator) {
// Crawl next microsite in iterator object.
var microsite = iterator.next();
if (microsite.done !== true) {
//console.log('Kalpana'+microsite.value.abbreviation);
var abbrCode = microsite.value.abbreviation.trim().replace(/\s+/g, '-').toLowerCase();
//console.log('Test'+abbrCode);
// Check if file exists.
fs.access('C:/Users/microsite-' + abbrCode + '.csv', fs.F_OK, function(err) {
if (err) {
// if it doesn't exist, we can crawl.
//console.log('Kalpanadevi'+microsite.value.url);
var crawler = new Crawler(microsite.value.url, microsite.value.abbreviation);
crawler.startCrawl(function(results) {
// Add to total URL collection.
var urlCollection = [];
for (var i = 0; i < results.length; i++) {
urlCollection.push({
'micrositeId': abbrCode,
'baseurl': microsite.value.url,
'url': results[i],
'abbreviation': microsite.value.abbreviation
});
}
// Store CSV.
storeCSV(urlCollection, abbrCode);
// Crawl next microsite.
crawlMicrosite(iterator);
});
} else {
// Move on if it isn't accessible.
console.log('File exists. Skip crawling ' + microsite.value.url);
crawlMicrosite(iterator);
}
});
}
else {
console.log('All scraping done!');
}
}
/**
* Store crawl results in CSV file.
*/
function storeCSV(urlCollection, abbrCode) {
var fields = ['baseurl', 'url', 'abbreviation'];
var csvObject = json2csv({ data: urlCollection, fields: fields });
fs.writeFile('C:/Users/microsite-' + abbrCode + '.csv', csvObject, function(err) {
if (err) throw err;
console.log('File saved!');
});
}
crawler.js文件将从microsites.CSV文件中读取网址
"url","abbreviation"
"https://www.flipkart.com","flip"
它应该抓取网站,但我会看到以下消息
Crawl: https://www.nature.com
All scraping done!
File saved!
当我检查CSV文件时。爬行没完成。它应该遍历每个HTML页面,并获得结果。
.CSV file
"baseurl","url","abbreviation"
"http://www.nature.com/","http://www.nature.com/","nature"
请在下面找到flipkart.com的示例输出
"baseurl","url","abbreviation"
"https://www.flipkart.com/","https://www.flipkart.com/","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/buy-gift-voucher?otracker=ch_vn_gift-voucher","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/mobile-apps?otracker=ch_vn_mobile_apps","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/s/contact","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/account/orders","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/notifications?otracker=Notifications_view_all","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/account/login?signup=true","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/account/login?ret=/","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/viewcart?otracker=Cart_Icon_Click","flipkart"
"https://www.flipkart.com/","https://www.flipkart.com/offers-list/deals-of-the-day?screen=dynamic&pk=themeViews%3DDOTD%3AdesktopDefaultDealCard~widgetType%3DdealCard&wid=2.dealCard.OMU&otracker=hp_omu_Deals+Of+The+Day_0","flipkart"
我无法获得nature.com的抓取工作。我错过了一些东西。任何有关这方面的建议都会有所帮助。工作的例子会很棒。提前致谢