我想为 ATP(网球)排名构建一个网页抓取工具,目前使用的代码如下:
let cheerio = require('cheerio');
let request = require('request');
let rp = require('request-promise');
/**
 * Scrapes a player's ATP ranking history and caches the most recent result.
 *
 * Fetching is asynchronous. `ranks()` only starts a request; the cached value
 * returned by `getRankings()` is updated later, once the response has been
 * parsed. Wait on the `ready` promise (or call `fetchRankings()` directly) to
 * use the data as soon as it is available:
 *
 *     api.ranks('Roger Federer');
 *     api.ready.then((rankings) => console.log(rankings));
 */
class DataApi {
  /**
   * Looks up the rankings-history page for a player.
   *
   * @param {string} player - Full player name, e.g. 'Roger Federer'.
   * @returns {string|undefined} The page URL, or undefined for an unknown player.
   */
  static playerMapping(player) {
    const mapping = {
      'Roger Federer': 'https://www.atpworldtour.com/en/players/roger-federer/f324/rankings-history',
      // NOTE(review): this URL uses the `/es/` locale while the one above uses `/en/` — confirm this is intended.
      'Nick Kyrgios': 'https://www.atpworldtour.com/es/players/nick-kyrgios/ke17/rankings-history'
    };
    // Own-property check so inherited keys such as 'toString' are not mistaken for players.
    return Object.prototype.hasOwnProperty.call(mapping, player) ? mapping[player] : undefined;
  }

  /**
   * Normalises the raw inner HTML of one table row's first two cells.
   *
   * @param {string} dateHtml - Inner HTML of the date cell, e.g. ' 2018.01.15 '.
   * @param {string} rankHtml - Inner HTML of the rank cell, e.g. ' 17T '.
   * @returns {{date: string, rank: number}} Date with '.' replaced by '/', and
   *   the rank parsed as a base-10 integer (NaN when the cell holds no number).
   */
  static parseRankingRow(dateHtml, rankHtml) {
    // Trim surrounding whitespace, then drop any remaining line breaks.
    const clean = (html) => html.replace(/^\s+|\s+$/g, '').replace(/(?:\r\n|\r|\n)/g, '');
    return {
      date: clean(dateHtml).replace(/\./g, '/'),
      rank: Number.parseInt(clean(rankHtml).replace('T', ''), 10)
    };
  }

  /**
   * Extracts a date -> rank map from the HTML of a rankings-history page.
   * Writes 'loading' followed by one '.' per table row to stdout as progress.
   *
   * @param {string} body - HTML document to parse.
   * @returns {Object<string, number>} Rankings keyed by date string.
   */
  static parseRankings(body) {
    const rankings = {};
    const $ = cheerio.load(body);
    process.stdout.write('loading');
    $('#playerRankHistoryContainer > table > tbody > tr').each((rowIndex, rowElement) => {
      process.stdout.write('.');
      const cells = $(rowElement).find('td');
      if (cells.length < 2) {
        return; // no date/rank pair in this row; continue with the next one
      }
      const { date, rank } = DataApi.parseRankingRow(cells.eq(0).html(), cells.eq(1).html());
      rankings[date] = rank;
    });
    return rankings;
  }

  /**
   * Creates the instance and immediately starts fetching the default player.
   */
  constructor() {
    this.rankings = {};
    // Incremented by every ranks() call so that a slow, superseded request
    // cannot overwrite the result of a newer one.
    this.latestRequestId = 0;
    // Resolves with the cached rankings once the most recent ranks() request
    // has settled. Never rejects: failures are logged by ranks().
    this.ready = Promise.resolve(this.rankings);
    this.ranks = this.ranks.bind(this);
    this.fetchRankings = this.fetchRankings.bind(this);
    this.getRankings = this.getRankings.bind(this);
    this.ranks();
  }

  /**
   * Returns the rankings cached by the last completed request.
   *
   * @returns {Object<string, number>} Date -> rank map; `{}` until a request completes.
   */
  getRankings() {
    return this.rankings;
  }

  /**
   * Downloads and parses a player's ranking history without touching the cache.
   *
   * @param {string} [player='Nick Kyrgios'] - Player to look up.
   * @returns {Promise<Object<string, number>>} Resolves with the date -> rank
   *   map; rejects when the player is unknown or the request fails.
   */
  fetchRankings(player = 'Nick Kyrgios') {
    const uri = DataApi.playerMapping(player);
    if (uri === undefined) {
      return Promise.reject(new Error(`No rankings page is configured for player "${player}"`));
    }
    const options = {
      uri, // page to request
      transform: (body) => DataApi.parseRankings(body) // turn the HTML into the rankings map
    };
    return Promise.resolve(rp(options));
  }

  /**
   * Starts refreshing the cached rankings for a player.
   *
   * The call returns before the data has arrived; use `this.ready` to be
   * notified when the cache has been updated.
   *
   * @param {string} [player='Nick Kyrgios'] - Player to look up.
   * @returns {number} 0 when a request was started, -1 when the player is unknown.
   */
  ranks(player = 'Nick Kyrgios') {
    if (DataApi.playerMapping(player) === undefined) {
      return -1;
    }
    this.latestRequestId += 1;
    const requestId = this.latestRequestId;
    this.ready = this.fetchRankings(player)
      .then((data) => {
        // Only the newest request may update the cache.
        if (requestId === this.latestRequestId) {
          this.rankings = data;
        }
        return this.rankings;
      })
      .catch((err) => {
        // Fire-and-forget entry point: report the failure and keep the old cache.
        console.error(`Failed to fetch rankings for ${player}:`, err);
        return this.rankings;
      });
    return 0;
  }
}
// The constructor itself calls ranks() with its default player.
const api = new DataApi();

{
  // Prints whatever rankings the instance holds at the moment of the call.
  const printCurrent = () => console.log(api.getRankings());

  printCurrent(); // default player is Nick Kyrgios
  api.ranks('Roger Federer'); // now fetching rankings for Roger Federer
  printCurrent();
}
抓取工具本身可以正常工作,但数据要过一段时间才会返回,这正是我遇到的难题。底部的两次 `console.log()` 调用输出的都是 `{}`,因为那时数据还没有取回。处理这类异步返回的数据,最佳实践是什么?我正在尽量阅读 `request` 模块的文档,但也想在这里碰碰运气。