我使用node.js编写一个爬虫函数。在这个函数中,我使用http模块来请求一个固定的网站,并使用setInterval()使爬虫函数每秒运行一次。
问题是:函数运行一段时间后,每个请求都会超时,再也没有成功的请求!
有一个现象:我在 testconfig.js 中配置了两个需要抓取的 URL,“cp.360.cn/ssccq/”返回的数据长度是 130961,“www.xjflcp.com/ssc/”返回的数据长度是 39509;但只有 cp.360.cn 会出现超时问题,而 www.xjflcp.com 不会!
目前我的解决办法是:一旦遇到超时就重新启动程序。谁能给我一个更好的解决方案?
以下是代码,共两个文件:testdata.js 和 testconfig.js
//testdata.js Main file
var http=require('http'),
url=require('url'),
querystring=require('querystring'),
config=require('./testconfig.js');//config file
// Last-resort handler: log uncaught exceptions so they are visible on the console.
// A thrown value is not always an Error object (a thrown string, for example, has no
// `.stack`), so fall back to printing the value itself when no stack is available.
process.on('uncaughtException', function(e){
  console.log(e && e.stack ? e.stack : e);
});
// Start polling: every configured site gets its own timer that calls run(site)
// once per second. The timer fires regardless of whether the previous request
// for that site has completed.
config.cp.forEach(function schedulePolling(site){
  setInterval(run, 1000, site);
});
// Wrap http.request so that `options.timeout` (milliseconds) acts as a hard deadline
// for the whole request/response exchange.
//
// When the deadline passes, 'timeout' is emitted on the request with
// {message:'have been timeout...'} and the request is destroyed. Destroying (rather
// than calling req.end(), which only finishes the request body) is what releases the
// underlying socket; otherwise stalled sockets stay open and accumulate.
// A destroyed request reports its failure through its 'error' event, so callers
// should attach an 'error' listener.
//
// The deadline timer is cancelled when the response ends or closes, and also when
// the request itself closes (e.g. it failed before any response arrived).
http.request=(function(_request){
  return function(options,callback){
    var timeout=options && options['timeout'],
      timeoutEventId;

    function cancelTimer(){
      clearTimeout(timeoutEventId);
    }

    var req=_request(options,function(res){
      res.on('end',cancelTimer);
      res.on('close',cancelTimer);
      // The callback is optional, as it is for the unwrapped http.request.
      if(typeof callback==='function') callback(res);
    });

    // Deadline reached: stop the timer, report, and tear the connection down.
    req.on('timeout',function(){
      cancelTimer();
      log("timeout");
      req.destroy();
    });

    // Covers requests that fail or are destroyed before a response is received.
    req.on('close',cancelTimer);

    if(timeout){
      timeoutEventId=setTimeout(function(){
        req.emit('timeout',{message:'have been timeout...'});
      },timeout);
    }
    return req;
  };
})(http.request);
/**
 * Fetch one configured site once and print the size of the response body.
 *
 * @param {Object} conf         One crawl target.
 * @param {string} conf.title   Label printed in front of the body length.
 * @param {Object} conf.option  Options handed to http.request; `path` gets a
 *                              timestamp appended as a cache-buster.
 */
function run(conf){
  // Deep-copy the options so the cache-buster is not appended to the shared config
  // object again on every call.
  var option=JSON.parse(JSON.stringify(conf.option));
  // Use '&' when the configured path already carries a query string.
  option.path+=(option.path.indexOf('?')===-1 ? '?' : '&')+(new Date()).getTime();

  http.request(option, function(res){
    var data="";
    // Decode at stream level: a multi-byte character split across two chunks is
    // reassembled, whereas converting each chunk separately would corrupt it.
    res.setEncoding('utf8');
    res.on("data", function(_data){
      data+=_data;
    });
    res.on("end", function(){
      console.log(conf.title+ data.length + "have crawl the data");
    });
    res.on("error", function(err){
      log('reson---'+err);
    });
  }).on('timeout', function(err){
    log('crawl timeout');
  }).on("error", function(err){
    log('reqon---'+err);
  }).end();
}
// testconfig.js
exports.cp=[
{
title:'chongqing ssc',
source:'360CaiPiaowang',
name:'cqssc',
enable:true,
timer:'cqssc',
option:{
host:"cp.360.cn",
timeout:50000,
path: '/ssccq/',
headers:{
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/22.0.1271.64 Safari/537.11"
}
},
parse:function(str){
try{
return getFrom360CP(str,1);
}catch(err){
}
}
},////////////
//{{{
{
title:'xinjiang ssc',
source:'cai lele',
name:'xjssc',
enable:true,
timer:'xjssc',
option:{
host:"www.xjflcp.com",
timeout:50000,
path: '/ssc/',
headers:{
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/26.0.1271.64 Safari/537.11"
}
},
parse:function(str){
return getFromXJFLCPWeb(str,12);
}
},
//}}}
];
// Timestamped console logger, published on the global object so that every module
// can call log(...) directly. Output format: "[<date> <local time>] <message>".
global.log=function(message){
  var now=new Date();
  var stamp=[now.toDateString(), now.toLocaleTimeString()].join(' ');
  console.log('['+stamp+'] '+message);
};
/**
 * Parse the draw shown on the www.xjflcp.com page.
 *
 * Looks for the first '<td><a href="javascript:detatilssc' marker and inspects the
 * 300 characters starting there for a 10-digit issue number, an HH:MM time and a
 * <p> element holding 9 characters of digits and spaces.
 *
 * @param {string} str  Page HTML.
 * @param {*} type      Value copied unchanged into the result's `type` field.
 * @returns {{type: *, time: string, number: string, data: string}}
 *   time   - "YYYY-MM-DD HH:MM", date taken from the first 8 digits of the issue number
 *   number - issue number as "<8 digits>-<2 digits>"
 *   data   - the drawn numbers joined with commas
 * @throws {Error} 'data is not correct' when str is not a string, the marker is
 *   missing, or the expected fields are not found.
 */
function getFromXJFLCPWeb(str, type){
  var MARKER='<td><a href="javascript:detatilssc';
  if(typeof str!=='string') throw new Error('data is not correct');

  var start=str.indexOf(MARKER);
  if(start===-1) throw new Error('data is not correct');

  // Line breaks are removed because `.` in the pattern below does not match them.
  var snippet=str.slice(start, start+300).replace(/[\r\n]+/g,'');
  var match=snippet.match(/(\d{10}).+(\d{2}\:\d{2}).+<p>([\d ]{9})<\/p>/);
  if(!match) throw new Error('data is not correct');

  var issue=match[1];
  return {
    type:type,
    time:issue.replace(/^(\d{4})(\d{2})(\d{2})\d{2}/, '$1-$2-$3 ')+match[2],
    number:issue.replace(/^(\d{8})(\d{2})$/, '$1-$2'),
    data:match[3].split(' ').join(',')
  };
}
/**
 * Parse the draw shown on the cp.360.cn page.
 *
 * Looks for the '<em class="red" id="open_issue">' marker and inspects the 380
 * characters starting there for the issue number and the 3 to 5 <li> entries of
 * the <ul id="open_code_list"> list.
 *
 * @param {string} str  Page HTML.
 * @param {*} type      Value copied unchanged into the result's `type` field.
 * @returns {{type: *, time: string, number: string, data: string}}
 *   time   - "YYYY-MM-DD " followed by toLocaleTimeString() of the local clock at
 *            parse time (it is not read from the page)
 *   number - the issue number, reformatted according to its digit count
 *   data   - the drawn numbers joined with commas
 * @throws {Error} 'data is not correct' when str is not a string, the marker is
 *   missing, or the expected structure is not found; 'parse data error ' when the
 *   drawn numbers cannot be extracted.
 */
function getFrom360CP(str, type){
  var MARKER='<em class="red" id="open_issue">';
  if(typeof str!=='string') throw new Error('data is not correct');

  var start=str.indexOf(MARKER);
  if(start===-1) throw new Error('data is not correct');

  var snippet=str.slice(start, start+380);
  var match=snippet.match(/[\s\S]*?(\d+)<\/em>[\s\S].*?<ul id="open_code_list">((?:[\s\S]*?<li class=".*?">\d+<\/li>){3,5})[\s\S]*?<\/ul>/);
  if(!match) throw new Error('data is not correct');

  function pad2(n){
    return n<10 ? '0'+n : String(n);
  }
  var now=new Date();
  var year=now.getFullYear();
  var mytime=year+'-'+pad2(now.getMonth()+1)+'-'+pad2(now.getDate())+' '+now.toLocaleTimeString();

  // Normalise the issue number according to how many digits the page printed.
  var issue=match[1];
  if(issue.length===7){
    issue=String(year)+issue.replace(/(\d{4})(\d{3})/,'$1-$2');
  }else if(issue.length===8){
    issue='20'+issue.replace(/(\d{6})(\d{2})/,'$1-0$2');
  }else if(issue.length===9){
    issue='20'+issue.replace(/(\d{6})(\d{2})/,'$1-$2');
  }else if(issue.length===10){
    issue=issue.replace(/(\d{8})(\d{2})/,'$1-0$2');
  }
  // Handles an 11-digit issue number; the forms produced above already contain
  // the dash, so this pattern leaves them unchanged.
  var mynumber=issue.replace(/(\d{8})(\d{3})/,'$1-$2');

  var items=match[2].match(/<li class=".*?">(\d+)<\/li>/g);
  if(!items) throw new Error('parse data error ');

  return {
    type:type,
    time:mytime,
    number:mynumber,
    data:items.map(function(item){
      return item.match(/<li class=".*?">(\d+)<\/li>/)[1];
    }).join(',')
  };
}