我在 Node.js 中编写爬虫时遇到了一些问题。事实上,当我针对 Vinted 网站启动爬虫时,就会出现错误,而且这些错误会反复出现很多次。首先,我得到的是:
Error: getaddrinfo ENOTFOUND www.vinted.fr www.vinted.fr:443
然后
...
Error: read ECONNRESET
Error: read ECONNRESET
Error: read ECONNRESET
Error: read ECONNRESET
Error: read ECONNRESET
...
有时候
Error: socket hang up
Error: socket hang up
Error: socket hang up
不过我的爬虫确实能够工作,并且会为一部分产品返回正确的结果,但大约 10 分钟后就停止了。我想这是因为我发送了太多请求,但我确实需要发送这么多请求……当然,这也可能是网络问题。
我完全被这些错误困住了,有办法解决吗?
感谢您的帮助。
这是我的代码:
// Reads link.json, then requests every listed product page on www.vinted.fr,
// scrapes a few fields out of the returned HTML with cheerio and inserts them
// into the `vinted` table.
fs.readFile(__dirname +'/link.json', 'utf8', function (err, data) {
var obj;
if (err) throw err;
obj = JSON.parse(data);
// urlp is assigned without var/let/const.
urlp = obj.link;
console.log(colors.yellow("Products:"+urlp.length));
// request() is called once per iteration and nothing spaces the calls out,
// so every request is started from this single loop.
// NOTE(review): i runs from 1 to urlp.length-2 and reads urlp[i-1], so the
// last two entries of urlp are never requested — confirm this is intended.
for(i = 1; i < urlp.length-1; i++){
url = 'https://www.vinted.fr'+urlp[i-1];
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
// link and json are not used anywhere in the visible part of this snippet.
var link = [];
var json = { link : ""};
// Price text with line breaks and spaces stripped.
var price = $('span[itemprop=price]').text();
var format_price = price.replace(/\n|\r/g,"");
var format_price2 = format_price.replace(/ /g,"");
var res1 = $('.details-list--details');
var meta = $("link[rel='canonical']").attr('href');
var images = []; // image array
// filter() is used here only to visit each matched element and collect its
// data-src attribute; its return value is discarded.
$('img[itemprop=image]').filter(function(){
var img = $(this).attr('data-src');;
images.push(img);
})
// var imageshow = console.log(colors.rainbow(images .join(", ")));
var brand = $('.inverse > [itemprop=name]').text();
var state = $('div[itemprop=itemCondition]').text();
var color = $('div[itemprop=color]').text();
// NOTE(review): i is the loop counter shared by all callbacks (it is not
// declared per iteration), and the index used here (i) differs from the one
// used to build url (i-1) — this may not log the URL that was requested.
console.log(urlp[i]);
// A non-empty .state-bar text is treated as "product sold" (vendu = 1).
var token_vendu = $('.state-bar').text();
if(token_vendu != ""){
console.log(colors.red('PRODUIT VENDU'));
var vendu = 1;
}else{
vendu = 0;
}
console.log(colors.blue("CallBack Vente "+vendu));
var discount_price = $('.old-price').text();
console.log("Discount: " + discount_price);
// Extract the size from the text of the details list; the parsing differs
// depending on whether a brand was found.
try{
if(brand == ""){
var size = res1.children().parent().text();
var format_size = size.replace(/ /g,"");
var format_size2 = format_size.replace(/[\n]/gi, " " );
var split_size1 = format_size2.split(" ");
var split_size2 = split_size1[0].split(" ");
var split4 = split_size2[4];
var formatsize = split4;
}else{
var size = res1.children().parent().children().text();
var format_size = size.replace(/ /g,"");
var format_size2 = format_size.replace(/[\n]/gi, " " );
var split_size = format_size2.split(" ");
console.log("split: "+split_size[1] )
var split3 = split_size[1].split(" ");
formatsize = split3[1];
}
} catch (e) {
// NOTE(review): split_size is only assigned in the else branch above, and
// formatsize is not given a fallback value here.
split_size[1] = "N/A";
console.log(e.message);
}
console.log("Size : " + formatsize);
console.log("Brand : "+brand);
console.log(meta);
console.log("color : " + color);
console.log("state : " + state);
//Save to database
// Values are passed as placeholders (?) rather than concatenated into the SQL.
connection.query('INSERT INTO `vinted` VALUES ( NULL , ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP())',
[brand,
color,
format_price2,
discount_price,
state,
formatsize,
vendu,
images.join(", "),
meta
]
, function (err, result) {
if (err) {console.error('error inserting into database : ' + err.stack); return;}
});
}else{console.log(colors.red(error));} // the ECONNRESET/ENOTFOUND ... errors are reported here
// NOTE(review): the snippet ends here; the closing braces of the request
// callback, the for loop and the readFile callback are not shown.
答案 0 :(得分:0)
我怀疑您的爬虫给您要抓取的网站造成了过大的压力,因此才会出现 ECONNRESET。我自己编写 Node.js 爬虫时也有过类似的经历:我不得不通过定时发送请求来自我限速,给服务器一些喘息的空间。
示例:
var request = require('request');
var EventEmitter = require('events').EventEmitter;

// Declared with var so the emitter is not created as an implicit global.
var emitter = new EventEmitter();

/**
 * Throttled crawl step: waits 60 seconds, requests `url`, and then emits
 * 'fetchNext' so that the next step is scheduled.
 *
 * `url` is not defined in this example; it is expected to be supplied by the
 * surrounding code.
 */
function doCrawl(){
    setTimeout(function(){
        //do crawling operation, e.g.
        request(url, function(err, resp, html){
            if(err){
                // Report the failure instead of silently dropping it.
                console.error(err);
            }else{
                //do all you want with the response here
            }
            // Trigger the next step whether or not this request succeeded;
            // otherwise a single failed request would stop the whole chain.
            emitter.emit('fetchNext');
        });
    }, 60000);
}

// Each 'fetchNext' event schedules one more delayed request.
emitter.on('fetchNext', doCrawl);
您可能还想考虑这样做
**修改** >> 使用示例代码
var urlStack = []; //an array that holds the list of urls you want to visit
var emitter = new EventEmitter();
emitter.on('fetchNext', delayedCrawl); //this is triggered after any item is saved

// Read the product paths from link.json, queue them as absolute URLs on
// urlStack, then start the crawl by emitting the first 'fetchNext'.
fs.readFile(__dirname + '/link.json', 'utf8', function(err, data) {
    if (err) throw err;
    // urlp and i are local here; they were implicit globals before.
    var urlp = JSON.parse(data).link;
    console.log(colors.yellow("Products:" + urlp.length));
    // NOTE(review): these bounds are kept from the original; they queue
    // urlp[0] .. urlp[length - 3], i.e. the last two entries are skipped.
    // Confirm that this is intended.
    for (var i = 1; i < urlp.length - 1; i++) {
        urlStack.push('https://www.vinted.fr' + urlp[i - 1]);
    }
    emitter.emit('fetchNext');
});

/**
 * Schedules the next crawl step after a fixed pause, so that requests are
 * spaced out instead of being sent all at once.
 */
function delayedCrawl(){
    setTimeout(doCrawl, 5000); //5-second delay
}
/**
 * Crawls one product page.
 *
 * Pops the next URL from urlStack, requests it, scrapes the product fields
 * out of the returned HTML with cheerio and inserts them into the `vinted`
 * table. Emits 'fetchNext' when the item has been handled — after the insert
 * callback runs, or right away when the request itself failed — so the crawl
 * keeps going after an error such as ECONNRESET. Returns without doing
 * anything once urlStack is empty.
 */
function doCrawl() {
    var url = urlStack.pop();
    if (!url) return;

    request(url, function(error, response, html) {
        if (error) {
            // here the error ECONNRESET/ENOTFOUND ...
            console.log(colors.red(error));
            // Move on to the next URL instead of stopping the whole crawl.
            emitter.emit('fetchNext');
            return;
        }

        var $ = cheerio.load(html);

        // Price text with line breaks and spaces stripped.
        var price = $('span[itemprop=price]').text();
        var format_price = price.replace(/\n|\r/g, "");
        var format_price2 = format_price.replace(/ /g, "");

        var res1 = $('.details-list--details');
        var meta = $("link[rel='canonical']").attr('href');

        // Collect the data-src attribute of every product image.
        var images = [];
        $('img[itemprop=image]').each(function() {
            images.push($(this).attr('data-src'));
        });

        var brand = $('.inverse > [itemprop=name]').text();
        var state = $('div[itemprop=itemCondition]').text();
        var color = $('div[itemprop=color]').text();
        console.log(url);

        // A non-empty .state-bar text is treated as "product sold".
        var vendu = 0;
        if ($('.state-bar').text() !== "") {
            console.log(colors.red('PRODUIT VENDU'));
            vendu = 1;
        }
        console.log(colors.blue("CallBack Vente " + vendu));

        var discount_price = $('.old-price').text();
        console.log("Discount: " + discount_price);

        // Extract the size from the text of the details list; the parsing
        // differs depending on whether a brand was found.
        var formatsize;
        try {
            var size;
            var format_size;
            var format_size2;
            if (brand === "") {
                size = res1.children().parent().text();
                format_size = size.replace(/ /g, "");
                format_size2 = format_size.replace(/[\n]/gi, " ");
                var split_size1 = format_size2.split(" ");
                var split_size2 = split_size1[0].split(" ");
                formatsize = split_size2[4];
            } else {
                size = res1.children().parent().children().text();
                format_size = size.replace(/ /g, "");
                format_size2 = format_size.replace(/[\n]/gi, " ");
                var split_size = format_size2.split(" ");
                console.log("split: " + split_size[1])
                var split3 = split_size[1].split(" ");
                formatsize = split3[1];
            }
        } catch (e) {
            // Fall back to "N/A" for the size itself. The previous code wrote
            // "N/A" into split_size, which is never read afterwards and is
            // undefined when the brand is empty.
            formatsize = "N/A";
            console.log(e.message);
        }

        console.log("Size : " + formatsize);
        console.log("Brand : " + brand);
        console.log(meta);
        console.log("color : " + color);
        console.log("state : " + state);

        //Save to database
        connection.query('INSERT INTO `vinted` VALUES ( NULL , ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP())', [brand,
            color,
            format_price2,
            discount_price,
            state,
            formatsize,
            vendu,
            images.join(", "),
            meta
        ], function(err, result) {
            // Schedule the next URL whether or not the insert succeeded.
            emitter.emit('fetchNext');
            if (err) {
                console.error('error inserting into database : ' + err.stack);
                return;
            }
        });
    });
}