Question

我使用node.js编写一个爬虫函数。在这个函数中，我使用http模块来请求一个固定的网站，并使用setInterval（）使爬虫函数每秒运行一次。

问题是：函数运行一段时间之后，每一个请求都会超时，再也没有一次请求能够成功！

还有一个现象：我在testconfig.js中配置了两个需要抓取的url，“cp.360.cn/ssccq/”返回的数据长度是130961，“www.xjflcp.com/ssc/”返回的数据长度是39509；其中cp.360.cn会出现超时问题，而www.xjflcp.com不会！

我目前的做法是：一旦遇到超时，就重新启动这个程序。谁能给我一个更好的解决方案？

以下是代码，共两个文件：testdata.js和testconfig.js

//testdata.js  Main file
var http=require('http'),
url=require('url'),
querystring=require('querystring'),
config=require('./testconfig.js');//config file

// Last-resort handler: print the stack of any exception nothing else caught
// so the process keeps running.
process.on('uncaughtException', function(err){
	console.log(err.stack);
});

// Start one repeating crawl per configured site, firing once every second.
config.cp.forEach(function(site){
	setInterval(function(){
		run(site);
	}, 1000);
});

//http rewrite
/**
 * Replace http.request with a wrapper that enforces an overall deadline.
 *
 * When `options.timeout` (milliseconds) is truthy, a timer is armed as soon
 * as the request is created. If the response has not ended or closed by the
 * time it fires, a 'timeout' event carrying `{message: ...}` is emitted on
 * the request and the request is destroyed, so its socket is released
 * instead of staying open behind a stalled server.
 *
 * @param {Object|string} options - passed through to the original http.request
 * @param {Function} [callback] - invoked with the response, as with http.request
 * @returns {http.ClientRequest} the request; the caller must still call end()
 */
http.request=(function(_request){
	return function(options,callback){
		var timeout=options['timeout'],
			timeoutEventId;

		// Disarm the deadline timer; harmless if it never started or already fired.
		function cancelTimer(){
			clearTimeout(timeoutEventId);
		}

		var req=_request(options,function(res){
			res.on('end',cancelTimer);
			res.on('close',cancelTimer);

			if(typeof callback==='function'){
				callback(res);
			}else{
				// Nobody is going to read the body: drain it so the
				// connection can be released.
				res.resume();
			}
		});

		// The request may close without ever producing a response (connection
		// error, destroy); the timer must not outlive it.
		req.on('close',cancelTimer);

		//timeout
		req.on('timeout',function(){
			log("timeout");
			cancelTimer();
			// end() only finishes sending the request and leaves the socket
			// open; destroy() tears the connection down. The caller is
			// notified of the teardown through the request's 'error' event.
			req.destroy();
		});
		timeout && (timeoutEventId=setTimeout(function(){
			req.emit('timeout',{message:'have been timeout...'});
		},timeout));
		return req;
	};
})(http.request);

/**
 * Fetch one configured page and log how many characters were received.
 *
 * @param {Object} conf - one entry of config.cp; `conf.option` holds the
 *   http.request options and `conf.title` prefixes the success log line.
 */
function run(conf){
	// Work on a deep copy so the cache-busting suffix below is never
	// appended to the shared configuration object.
	var option=JSON.parse(JSON.stringify(conf.option));
	option.path+='?'+(new Date()).getTime();

	var req=http.request(option, function(res){
		var data="";

		// Decode at stream level: converting each chunk separately would
		// corrupt a multi-byte character that straddles two chunks.
		res.setEncoding('utf8');

		res.on("data", function(chunk){
			data+=chunk;
		});
		res.on("end", function(){
			console.log(conf.title+ data.length + "have crawl the data");
		});
		res.on("error", function(err){
			log('reson---'+err);
		});
	});

	req.on('timeout', function(err){
		log('crawl timeout');
	});
	req.on("error", function(err){
		log('reqon---'+err);
	});
	req.end();
}

// testconfig.js  
exports.cp=[	
	{
		title:'chongqing ssc',
		source:'360CaiPiaowang',
		name:'cqssc',
		enable:true,
		timer:'cqssc', 

		option:{
			host:"cp.360.cn",
			timeout:50000,
			path: '/ssccq/',
			headers:{
				"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/22.0.1271.64 Safari/537.11"
			}
		},
		parse:function(str){
			try{
				return getFrom360CP(str,1);
			}catch(err){
			}
		}
	},////////////
		
	//{{{
	{
		title:'xinjiang ssc',
		source:'cai lele',
		name:'xjssc',
		enable:true,
		timer:'xjssc',

		option:{
			host:"www.xjflcp.com",
			timeout:50000,
			path: '/ssc/',
			headers:{
				"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/26.0.1271.64 Safari/537.11"
			}
		},
		
		parse:function(str){
			return getFromXJFLCPWeb(str,12);
		}
	},
	//}}}
];


// Global logger: writes the message to the console prefixed with the current
// date and local time in square brackets.
global.log=function(log){
	var now=new Date(),
		stamp=[now.toDateString(), now.toLocaleTimeString()].join(' ');

	console.log('['+stamp+'] '+log);
};

// www.xjflcp.com parse regulation
/**
 * Extract one draw from the www.xjflcp.com page markup.
 *
 * Looks at the 300 characters starting at the first
 * `<td><a href="javascript:detatilssc` cell and pulls out a 10-digit issue
 * id, an HH:MM time and a 9-character digits-and-spaces result.
 *
 * @param {string} str - page HTML
 * @param {*} type - copied unchanged into the result
 * @returns {{type: *, time: string, number: string, data: string}}
 *   time as "YYYY-MM-DD HH:MM", number as "YYYYMMDD-NN", data comma-separated
 * @throws {Error} 'data is not correct' when the marker or the expected
 *   fields are missing
 */
function getFromXJFLCPWeb(str, type){
	var start=str.indexOf('<td><a href="javascript:detatilssc');
	// State the "marker not found" case explicitly rather than letting a
	// negative index select an arbitrary tail of the page.
	if(start<0) throw new Error('data is not correct');

	// Line breaks are removed because '.' in the pattern does not cross them.
	var fragment=str.slice(start, start+300).replace(/[\r\n]+/g,'');

	var match=fragment.match(/(\d{10}).+(\d{2}\:\d{2}).+<p>([\d ]{9})<\/p>/);
	if(!match) throw new Error('data is not correct');

	var issue=match[1];
	return {
		type:type,
		// First eight digits of the issue id form the date; the last two are dropped here.
		time:issue.replace(/^(\d{4})(\d{2})(\d{2})\d{2}/, '$1-$2-$3 ')+match[2],
		number:issue.replace(/^(\d{8})(\d{2})$/, '$1-$2'),
		data:match[3].split(' ').join(',')
	};
}

// cp.360.cn parse regulation
/**
 * Extract the current draw from the cp.360.cn page markup.
 *
 * Looks at the 380 characters starting at the `open_issue` element, reads
 * the issue number that precedes `</em>` and the 3 to 5 `<li>` numbers inside
 * the `open_code_list` list.
 *
 * @param {string} str - page HTML
 * @param {*} type - copied unchanged into the result
 * @returns {{type: *, time: string, number: string, data: string}}
 *   time is the local time of parsing ("YYYY-MM-DD " followed by
 *   toLocaleTimeString()), number is the normalised issue id, data is the
 *   drawn numbers joined with commas
 * @throws {Error} 'data is not correct' when the marker or the expected
 *   markup is missing
 */
function getFrom360CP(str, type){
	var start=str.indexOf('<em class="red" id="open_issue">');
	if(start<0) throw new Error('data is not correct');

	var fragment=str.slice(start, start+380);
	var layout=/[\s\S]*?(\d+)<\/em>[\s\S].*?<ul id="open_code_list">((?:[\s\S]*?<li class=".*?">\d+<\/li>){3,5})[\s\S]*?<\/ul>/;
	var match=fragment.match(layout);
	// A page that does not have the expected layout is reported as bad data
	// rather than crashing on a null match.
	if(!match) throw new Error('data is not correct');

	var now=new Date();
	var year=now.getFullYear();
	var month=now.getMonth()+1;	// getMonth() is zero-based
	var day=now.getDate();
	if(month<10) month="0"+month;
	if(day<10) day="0"+day;
	var mytime=year+"-"+month+"-"+day+" "+now.toLocaleTimeString();

	// Normalise the issue id to "YYYYMMDD-NNN"; the site may shorten the
	// year and/or the sequence number, so the rule depends on the length.
	var issue=match[1];
	switch(issue.length){
		case 7:		// MMDDNNN
			issue=year+issue.replace(/(\d{4})(\d{3})/,'$1-$2');
			break;
		case 8:		// YYMMDDNN
			issue='20'+issue.replace(/(\d{6})(\d{2})/,'$1-0$2');
			break;
		case 9:		// YYMMDDNNN
			issue='20'+issue.replace(/(\d{6})(\d{2})/,'$1-$2');
			break;
		case 10:	// YYYYMMDDNN
			issue=issue.replace(/(\d{8})(\d{2})/,'$1-0$2');
			break;
	}
	// Only an id with 11 consecutive digits still needs the dash here.
	var mynumber=issue.replace(/(\d{8})(\d{3})/,'$1-$2');

	var itemReg=/<li class=".*?">(\d+)<\/li>/;
	var numbers=match[2].match(/<li class=".*?">(\d+)<\/li>/g).map(function(item){
		return item.match(itemReg)[1];
	});

	return {
		type:type,
		time:mytime,
		number:mynumber,
		data:numbers.join(',')
	};
}

大量的http.request导致nodejs中的每个请求超时

0 个答案: