为了一个研究项目,我想从国际足联(FIFA)网站上抓取国际足球比赛的所有结果。我使用 R 来完成这项工作。但是,包含比赛的表格似乎是用 JavaScript 生成的。这是我想要抓取的网址:
http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018
我尝试先用 PhantomJS 渲染页面(让 JavaScript 生成表格),再保存渲染后的页面,但在生成的 HTML 中仍然没有包含比赛结果的表格。这是我的代码:
# Target page whose match table is filled in by JavaScript.
# The URL must be a single-line string: a line break inside the literal would
# embed a newline in the URL and, after sprintf(), inside the single-quoted
# JavaScript string passed to page.open(), making scrape.js invalid.
url <- "http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018"

# Generate a PhantomJS script that opens the page and writes its HTML to
# 'scrape.html'. sprintf() substitutes the URL for the %s placeholder.
# NOTE(review): page.content is read directly in the page.open() callback with
# no additional wait, so markup added asynchronously by the page's scripts
# after load may not be present in the saved file -- confirm.
writeLines(sprintf("
var page = require('webpage').create();
var fs = require('fs');
var path = 'scrape.html'
page.open('%s', function (status) {
var content = page.content;
fs.write(path, content, 'w')
phantom.exit();
});", url), con="scrape.js")

# Run the generated script with the PhantomJS binary in the working directory.
system("./phantomjs.exe scrape.js")
答案 0 :(得分:2)
您不需要等表格构建完成之后再去抓取它;该网站会调用以下这些端点来获取数据:
http://data.fifa.com/livescores/en/internationaltournaments/matches/m/byyearandmonth/2018/5
http://data.fifa.com/livescores/live/matches
要找到这些端点,请使用浏览器的网络检查器(按 F12)。更简单的做法是直接获取用于构建这些表格的 JSON,而不是在表格构建完成之后再去抓取表格。
编辑:构建表格所需的全部数据都在这些 JSON 中。要获取数据,首先发送 GET 请求,下载这些端点返回的内容。查看返回的内容时,您会看到这些 JSON 被包在一个函数调用里,只需将这层函数调用去掉即可。
例如,对于第一个链接,您需要删除包裹 JSON 的开头部分 _matchesByYearAndMonthCallback( 以及末尾的 )。
删除之后,您将得到一个有效的 JSON,可以在 R 中使用 jsonlite 或 rjson 包进行解析,具体用法请查阅相应文档。用其中一个包解析之后,您应该会得到一个数据框,可以从中选取所需的信息。
以下是您将得到的 JSON 的开头部分示例:
{
"competitionslist": {
"0": {
"name": "Friendlies",
"idCup": 506,
"edition": 1872,
"idCupSeason": 2000010101,
"isFifaCompetition": true,
"countryCode": "",
"cupKindID": 105,
"competitionSeoName": "friendly-506",
"hasStanding": false,
"linkMatches": "",
"linkStanding": "",
"link": "",
"hasMatchLive": false,
"isActiveSeason": true,
"matchlist": [{
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300438343,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43818,
"homeCountryCode": "IRQ",
"homeTeamName": "Iraq",
"idAwayTeam": 43989,
"awayCountryCode": "PLE",
"awayTeamName": "Palestine",
"matchDate": "2018-05-08T16:00:00Z",
"matchDateUTC": "2018-05-08T16:00:00Z",
"kickOffTime": "16:00",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 0,
"venueName": "Basra ",
"competitionSeoName": "friendly-506",
"matchSeoName": "Iraq-Palestine-300438343",
"homeTeamSeoName": "iraq-43818",
"awayTeamSeoName": "palestine-43989",
"hasStanding": false,
"winTeamName": "",
"winTeamShortName": "",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
}, {
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300439349,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43843,
"homeCountryCode": "ALG",
"homeTeamName": "Algeria",
"idAwayTeam": 43835,
"awayCountryCode": "KSA",
"awayTeamName": "Saudi Arabia",
"matchDate": "2018-05-09T19:30:00Z",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 2,
"venueName": "Cadiz ",
"idWinTeam": 43835,
"competitionSeoName": "friendly-506",
"matchSeoName": "Algeria-Saudi Arabia-300439349",
"homeTeamSeoName": "algeria-43843",
"awayTeamSeoName": "saudi-arabia-43835",
"hasStanding": false,
"winTeamName": "Saudi Arabia",
"winTeamShortName": "Saudi Arabia",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
},