我在 Node.js 中使用 "request" 模块异步获取大量数据。
首先,请求回调不包含原始请求的 POST 参数(这很糟糕),所以我必须在请求中添加一个自定义的 x- 请求头来携带这些信息(因为原始请求的头可以在回调的响应对象中访问到)。
但是,请求标头通常与响应结果不匹配。即我快速连续请求两个网址,回调会将它们混合起来。它会声称一个请求的请求标头属于另一个请求的结果。
/**
 * Polls the spectator page for ONE player per invocation, records the result
 * on that player's entry in the shared `players` object, and then schedules
 * itself to run again 100 ms later.
 *
 * Names this block expects to exist outside of it: `players` (object keyed by
 * player ID whose entries may carry an `ign`), `activeGamePlayer` (index of the
 * player to poll next), `faultyIGN`, `request`, `jsdom` and `URL`.
 *
 * The player being polled is captured by the closure of the callbacks below,
 * so the response never has to be matched back to a player through request
 * headers or through the name scraped from the page.
 *
 * The index only advances once a page has been parsed (or when the player has
 * no `ign`); on a transport error, a non-200 status or an unparsable page the
 * same player is polled again on the next run.
 */
var getActiveGames = function() {
    console.log(activeGamePlayer);

    var playerIDs = Object.keys(players);

    // Wrap only after the index has moved past the LAST player. Comparing
    // against `length - 1` would reset one step early and never poll the
    // final entry.
    if (activeGamePlayer >= playerIDs.length) {
        activeGamePlayer = 0;
    }

    // Every code path below calls this exactly once, and only after it has
    // finished touching `activeGamePlayer`, so two polls never overlap.
    var scheduleNext = function() {
        setTimeout(getActiveGames, 100);
    };

    // Nothing to poll yet; try again later instead of dereferencing undefined.
    if (playerIDs.length === 0) {
        scheduleNext();
        return;
    }

    var player = playerIDs[activeGamePlayer];
    var ign = players[player].ign;

    if (!ign) {
        //no ign
        activeGamePlayer += 1;
        scheduleNext();
        return;
    }

    request.post({
        headers: {
            'content-type' : 'application/x-www-form-urlencoded',
            'x-summoner' : player
        },
        url: URL,
        // The body is form-urlencoded, so the name must be escaped; otherwise
        // characters such as '&', '=' or spaces would corrupt the form data.
        body: 'userName=' + encodeURIComponent(ign) + '&force=true'
    }, function(error, response, body) {
        if (error) {
            console.log("GetActiveGame: " + error);
            scheduleNext();
            return;
        }
        if (response.statusCode !== 200) {
            console.log("GetActiveGame: Status Code not 200");
            scheduleNext();
            return;
        }

        jsdom.env(body, ['http://code.jquery.com/jquery.js'], function(err, window) {
            // `window` may be missing altogether when jsdom reports an error.
            if (!window || !window.jQuery) {
                console.log("Error: No jQuery object in jsdom body.");
                scheduleNext();
                return;
            }

            activeGamePlayer += 1;
            var $ = window.jQuery;
            var isPlaying = $('div:first').hasClass('SpectatorBig');
            var playername = isPlaying ? $('tr.mine .summonerName').html() : $('div.nBoxContent b').html();

            // A page without a player name is treated as faulty data.
            if (!playername) {
                console.log ("Error: Faulty data from op.gg");
                faultyIGN = Object.keys(players).length;
                scheduleNext();
                return;
            }

            // `player` comes from the enclosing call, so it always identifies
            // the player this response belongs to.
            var record = players[player];
            if (!record) {
                // The entry disappeared while the request was in flight.
                scheduleNext();
                return;
            }

            if (isPlaying) {
                var champion = $('tr.mine div.__spc32').removeClass('__spc32 img').attr('class');
                if (champion) {
                    // Strip the sprite prefix from the remaining class name.
                    champion = champion.replace('__spc32-', '');
                }
                var timestamp;
                if ($('._countdown').length) {
                    timestamp = $('._countdown').attr('data-timestamp');
                } else {
                    timestamp = $('._timeago').attr('data-datetime');
                }
                record.activeGame = {
                    'timestamp' : timestamp,
                    'champion' : champion,
                    'finished' : false
                };
            } else if (Object.prototype.hasOwnProperty.call(record, 'activeGame')) {
                // Not playing any more: flag a previously recorded game as finished.
                record.activeGame.finished = true;
            }

            scheduleNext();
        });
    });
};
问题是此实例中的响应对象包含不正确的信息。它包含的请求和响应信息不匹配,它们来自不同的请求和响应。
答案 0 :(得分:0)
http://blog.miguelgrinberg.com/post/easy-web-scraping-with-nodejs
诀窍是将回调包装在一个立即执行的函数(IIFE)中,并把您想要保留的信息作为参数传给它。这样,回调函数就可以通过闭包访问这些信息。
// Issue one request per entry in `pools`. Each callback is produced by an
// immediately-invoked function that receives the current key, so the callback
// sees the key its own request was made for rather than whatever the loop
// variable holds by the time the response arrives.
for (var pool in pools) {
    var url = 'http://www.thprd.org/schedules/schedule.cfm?cs_id=' + pools[pool];
    request(url, (function(pool) {
        return function(err, resp, body) {
            if (err) {
                throw err;
            }
            // Declared locally so concurrent callbacks do not share one global `$`.
            var $ = cheerio.load(body);
            console.log(pool);
            // TODO: scraping goes here!
        };
    })(pool));
}