使用RVest将字段添加到已刮除表列表中

时间:2018-12-18 00:30:36

标签: r web-scraping dplyr rvest

我已经成功地(在SO用户的帮助下)抓取了想要的数据,但是我缺少每个抓取表中的数据代表谁的关键。因此,我尝试使用mutate添加一个名为player的字段,该字段与player [[j]]相同,但是在列表上不起作用。我已经读过有关lapply的内容,并尝试也没有成功。有关如何实现此目标的任何建议?

library(rvest)
library(plyr)
library(dplyr)
library(tidyr)


### get a list of players
page <- (0:18)
urls <- list()
for (i in 1:length(page)) {
  url<- paste0("https://www.mlssoccer.com/players?page=",page[i])
  urls[[i]] <- url
}

tbl <- list()
j <- 1
for (j in seq_along(urls)) {
  tbl[[j]] <- urls[[j]] %>%   
    read_html() %>% 
    html_nodes("a.name_link") %>%
    html_text()
  j <- j+1
  if (j == length(urls)) break
}

### join all of the names into one data frame
tbl <- ldply(tbl, data.frame)


player_tb<- as.data.frame(lapply(tbl, tolower))
colnames(player_tb) <- 'name'
player_table<- as.list(gsub(" ", "-", player_tb$name)) 
colnames(player_table) <- 'player'

#### using a list of players, get the game summary for each regular    season game, adding the player name to the table
pages<- list()
for( i in seq_along(player_table)) {
  page <- paste0("https://www.mlssoccer.com/players/",player_table[i])
  pages[[i]] <- page
}


player_stats <- list()
j <- 1
for (j in seq_along(pages)) {
  player_stats[[j]] <- pages[[j]] %>%   
    read_html() %>% 
    html_nodes("table") %>%
    html_table() %>%
    mutate(player = player)  ## this is the piece that fails
  j <- j+1                   
  if (j == length(pages)) break
}

t <- do.call(rbind, player_stats)

2 个答案:

答案 0 :(得分:2)

您可以尝试使用purrr软件包来避免for循环并加快速度

使用purrr,您还将拥有这些非常酷的功能safelypossiblyquietly。一些玩家没有统计信息,您的代码将失败。现在不会了

这个想法是在一个大数据框中收集所有统计数据,并在其中包含一个带有玩家姓名的标识符列

library(rvest)
library(tidyverse)

# lets assume 3 pages only to do it quickly
page <- (0:2)

# no need to create a list. Just a vector
urls = paste0("https://www.mlssoccer.com/players?page=", page)

# define this function that collects the player's name from a url
get_the_names = function( url){
  url %>% 
    read_html() %>% 
    html_nodes("a.name_link") %>% 
    html_text()
}

# map the urls to the function that gets the names
players = map(urls, get_the_names) %>% 
  # turn into a single character vector
  unlist() %>% 
  # make lower case
  tolower() %>% 
  # replace the `space` to underscore
  str_replace_all(" ", "-")


# Now create a vector of player urls
player_urls = paste0("https://www.mlssoccer.com/players/", players )

# define a function that reads the 3rd table of the url
get_the_summary_stats <-  function(url){

  url %>% 
    read_html() %>% 
    html_nodes("table") %>% 
    html_table() %>% .[[3]] 
}

# lets read 3 players only to speed things up [otherwise it takes a significant amount of time to run...]
a_few_players = player_urls[1:3]

# get the stats 
tables = a_few_players %>% 
  # important step so I can name the rows I get in the table
  set_names() %>% 
  #map the player urls to the function that reads the 3rd table
  # note the `safely` wrap around the get_the_summary_stats' function
  # since there are players with no stats and causes an error (eg.brenden-aaronson )
  # the output will be a list of lists [result and error]
  map(., safely(get_the_summary_stats)) %>% 
  # collect only the `result` output (the table) INTO A DATA FRAME
  # There is also an `error` output
  # also, name each row with the players name
  map_df("result", .id = "player") %>% 
  #keep only the player name (remove the www.mls.... part)
  mutate(player = str_replace(player, "https://www.mlssoccer.com/players/", "")) %>% 
  as_tibble()

让我们看看有多少人

  tables %>% count(player)

# A tibble: 2 x 2
  player                n
  <chr>             <int>
1 anatole-abang        81
2 saad-abdul-salaam   136

现在您可以按播放器名称过滤数据框

  tables %>% 
  filter(player == "anatole-abang")

# A tibble: 81 x 14
   player        Date       Match      Result Appearance  MINS     G     A  SHTS   SOG    FC    FS     Y     R
   <chr>         <chr>      <chr>      <chr>  <chr>      <int> <int> <int> <int> <int> <int> <int> <int> <int>
 1 anatole-abang 10/28/2018 ORL @ RBNY W 0-1  Unused Sub     0     0     0     0     0     0     0     0     0
 2 anatole-abang 10/21/2018 RBNY @ PHI W 1-0  Unused Sub     0     0     0     0     0     0     0     0     0
 3 anatole-abang 10/06/2018 RBNY @ SJ  W 3-1  Unused Sub     0     0     0     0     0     0     0     0     0
 4 anatole-abang 9/30/2018  ATL @ RBNY W 0-2  Unused Sub     0     0     0     0     0     0     0     0     0
 5 anatole-abang 9/22/2018  TOR @ RBNY W 0-2  Unused Sub     0     0     0     0     0     0     0     0     0
 6 anatole-abang 9/16/2018  RBNY @ DC  T 3-3  Unused Sub     0     0     0     0     0     0     0     0     0
 7 anatole-abang 9/01/2018  RBNY @ MTL L 0-3  Unused Sub     0     0     0     0     0     0     0     0     0
 8 anatole-abang 8/29/2018  HOU @ RBNY W 0-1  Unused Sub     0     0     0     0     0     0     0     0     0
 9 anatole-abang 8/26/2018  DC @ RBNY  W 0-1  Unused Sub     0     0     0     0     0     0     0     0     0
10 anatole-abang 8/22/2018  RBNY @ NYC T 1-1  Unused Sub     0     0     0     0     0     0     0     0     0
# ... with 71 more rows

答案 1 :(得分:0)

您遇到的问题是由于玩家状态返回了4个单独的表而不是一个。
我已经稍微简化了您的代码,但这不是最终的解决方案,因为最终结果是列表列表。现在,您可以在最终列表上使用lapply来收集每个单独的表,并在需要时将它们合并。

library(rvest)
library(dplyr)
library(tidyr)

### get a list of players
page <- (0:18)
urls<- paste0("https://www.mlssoccer.com/players?page=",page)

tbl <- list()
for (j in seq_along(urls)) {
  tbl[[j]] <- urls[j] %>%   
    read_html() %>% 
    html_nodes("a.name_link") %>%
    html_text()
#add a delay so not to overwhelm server
 Sys.sleep(0.75)
}

### join all of the names into one data frame
player_tb<- tolower(unlist(tbl))
player_table <-data.frame(player= gsub(" ", "-", player_tb))

#### using a list of players, get the game summary for each regular    season game, adding the player name to the table
pages <- paste0("https://www.mlssoccer.com/players/",player_table$player)

player_stats <- list()
for (j in seq_along(pages)) {
  player_stats[[j]] <- pages[j] %>%   
    read_html() %>%   
    html_nodes("table") %>%
    html_table() 
  #determine if the status are present
  #bind player name to the table 
  if (length(ttables)==4){
    player_stats[[j]]<-cbind(player_table$player[j], ttables[[3]])
  } else {
    player_stats[[j]]<-cbind(player_table$player[j], ttables[[1]])
  }
  #add a delay so not to overwhelm server
  #get up and stretch your legs!
  Sys.sleep(0.75)  
}
#combine all of the player status into one dataframe
finalanswer<-do.call(rbind, player_stats)

此代码假定播放状态具有1或4个与之相关的表,如果不正确,则需要更改if / else语句以进行匹配。
希望这对您有帮助。