使用请求api处理我的网络抓取数据的最佳方法

时间:2018-06-24 01:22:28

标签: javascript request cheerio request-promise

所以我只是想为ATP(网球)排名构建一个网络抓取工具。我使用以下代码进行操作:

let cheerio = require('cheerio');
let request = require('request');
let rp = require('request-promise');


class DataApi {

    static playerMapping(player) {
        const mapping = {
            'Roger Federer': 'https://www.atpworldtour.com/en/players/roger-federer/f324/rankings-history',
            'Nick Kyrgios': 'https://www.atpworldtour.com/es/players/nick-kyrgios/ke17/rankings-history'

        };
        return mapping[player];
    }

    constructor() {
        this.rankings = {};
        this.ranking = this.ranking.bind(this);
        this.ranks = this.ranks.bind(this);
        this.getRankings = this.getRankings.bind(this);
        this.ranks();
    }


    getRankings() {
        return this.rankings;
    }



    ranks(player = 'Nick Kyrgios') {

        // configuring options for request call
        let options = {
            uri: DataApi.playerMapping(player), // uri to request
            transform: function(body) {
                let rankings = {};
                let $ = cheerio.load(body); //setting up cheerio to parse html
                process.stdout.write('loading');
                $('#playerRankHistoryContainer > table > tbody > tr').each((index, element) => { // parsing the html with the correct selector
                    process.stdout.write('.');
                    let row = $(element).find('td');
                    let date = '';
                    row.each((index, element) => {
                        if (index === 1) {
                            let data = parseInt($(element).html().replace(/^\s+|\s+$/g, '').replace(/(?:\r\n|\r|\n)/g, '').replace('T', '')); // ignore this regex
                            let incData = {
                                date: date,
                                data: data
                            };
                            rankings[incData.date] = incData.data;
                        } else if (index === 0) {
                            date = $(element).html().replace(/^\s+|\s+$/g, '').replace(/(?:\r\n|\r|\n)/g, '').replace(/\./g, '/'); // ignore this regex too
                        }
                    });
                });
                return rankings; // returned scraped data to be processed in the Promise resolution (request-promise call returns a Promise)
            }
        };
        let reqRes = rp(options).then(
            (data) => {
                this.rankings = data;
            });
        if (reqRes) {
            return 0;
        } else {
            return -1;
        }
    }
}

const api = new DataApi();
console.log(api.getRankings()); // default player is Nick Kyrgios
api.ranks('Roger Federer'); // now fetching rankings for Roger Federer
console.log(api.getRankings());

刮板有效,但是一段时间后它将返回数据。这就是我在努力的目标。在底部的console.log()调用中,它们返回{},因为尚未恢复数据。解决这样的数据的最佳实践是什么?我正在尝试尽可能地阅读request模块文档,但我想我也可以在这里尝试一下运气。

0 个答案:

没有答案