使用rvest

时间:2017-03-05 10:29:17

标签: r web-scraping rvest

我正试图抓取2012-2016年斯德哥尔摩马拉松比赛的成绩。我可以使用下面列出的代码做到这一点,但每次抓取完某一年的结果后,我都必须手动更改URL才能抓取下一年的结果。

这让我很困扰,因为唯一需要改变的只是URL中的年份部分(原文中以粗体标出的 2012):http://results.marathon.se/2012/?content=list&event=STHM&num_results=250&page=1&pid=list&search[sex]=M&lang=SE

如何修改下面的代码,以便抓取每一年的结果,并将结果输出到单个数据框中,且该数据框还包含一列用于指示每条观测所属的年份?

library(dplyr)
library(rvest)
library(tidyverse)

# URL template for the 2012 result list; the single %s is the page number
page_url <- 'http://results.marathon.se/2012/?content=list&event=STHM&num_results=250&page=%s&pid=list&search[sex]=M&lang=EN'

# Find the total number of pages to scrape: read the text of the link(s)
# matched by 'a:nth-child(6)' on the first page and convert it to a number
tot_pages <- read_html(sprintf(page_url, 1)) %>%
  html_nodes('a:nth-child(6)') %>% html_text() %>% as.numeric()

# Use the first match explicitly (this is what `1:tot_pages` would do, with a
# warning, if several nodes matched) and fail with a clear message when the
# selector found nothing or the text was not numeric
tot_pages <- tot_pages[1]
if (is.na(tot_pages)) {
  stop("Could not determine the number of result pages", call. = FALSE)
}

# Store the URLs in a vector, one per result page
URLs <- sprintf(page_url, seq_len(tot_pages))

# Create a progress bar with one tick per page
pb <- progress_estimated(tot_pages, min = 0)

# Scrape the runner names and finishing times from a single result page.
#   URL: address of the page to download.
# Returns a tibble with the columns `Name` and `finish_time`.
# Ticks the progress bar `pb` (defined outside this function) once per call.
getdata <- function(URL) {
  pb$tick()$print()
  page <- read_html(URL)

  # Text of the third cell of every table-body row
  runner_names <- html_text(html_nodes(page, 'tbody td:nth-child(3)'))
  # Text of every element with class "right" inside the table body
  finish_times <- html_text(html_nodes(page, 'tbody .right'))

  name_tbl <- set_names(as_tibble(runner_names), c('Name'))
  mutate(name_tbl, finish_time = finish_times)
}

# Scrape every page and row-bind the per-page tibbles into one data frame
results <- map_df(URLs, getdata)

1 个答案:

答案 0 :(得分:1)

您可以使用lapply执行此操作:

library(dplyr)
library(rvest)
library(tidyverse)

# Years to scrape (inclusive range)
years <- 2012:2016

# Scrape every year in `years`. Each element of Results.list is one year's
# results as a tibble with the columns Name, finish_time and year.
Results.list <- lapply(years, function(x) {

  # URL template shared by all requests: first %s is the year, second the page
  url_template <- 'http://results.marathon.se/%s/?content=list&event=STHM&num_results=250&page=%s&pid=list&search[sex]=M&lang=EN'

  # Find the total number of pages to scrape from the first page of this year
  tot_pages <- read_html(sprintf(url_template, x, 1)) %>%
    html_nodes('a:nth-child(6)') %>% html_text() %>% as.numeric()

  # Use the first match explicitly (this is what `1:tot_pages` would do, with
  # a warning, if several nodes matched) and fail with a clear message when
  # the selector found nothing or the text was not numeric
  tot_pages <- tot_pages[1]
  if (is.na(tot_pages)) {
    stop("Could not determine the number of result pages for year ", x,
         call. = FALSE)
  }

  # Store the URLs in a vector, one per result page
  URLs <- sprintf(url_template, x, seq_len(tot_pages))

  # Create a progress bar with one tick per page
  pb <- progress_estimated(tot_pages, min = 0)

  # Scrape the name and finishing time from one page; returns a tibble with
  # the columns Name and finish_time
  getdata <- function(URL) {
    pb$tick()$print()
    pg <- read_html(URL)
    html_nodes(pg, 'tbody td:nth-child(3)') %>% html_text() %>% as_tibble() %>% set_names(c('Name')) %>%
      mutate(finish_time = html_nodes(pg, 'tbody .right') %>% html_text())
  }

  # Map everything into a dataframe (per-page tibbles are row-bound)
  results <- map_df(URLs, getdata)

  # add an id column indicating which year
  results$year <- x

  results

})

# Stack the per-year tibbles into a single tidy data frame
Results <- Results.list %>% bind_rows()