使用 rvest 进行网页抓取

时间:2019-01-23 04:57:27

标签: r web-scraping rvest

我正在尝试从下面代码中的结果页面抓取游泳、骑车、跑步和总时间的数据。

使用 SelectorGadget(选择器小工具)查找游泳时间对应的选择器后,抓取结果为 character(0),即一个空的字符向量。

library(rvest)
library(xml2)

# Download the results page, select each <td> that is the sixth child of its
# parent, and read the text of those cells.
# As reported in the question, this selector yields an empty character vector
# for this page.
url <- "http://m.ironman.com/triathlon/events/americas/ironman-70.3/pucon/results.aspx"
html <- read_html(url)

swim_time <- html_text(html_nodes(html, "td:nth-child(6)"))

感谢您的帮助。

2 个答案:

答案 0 :(得分:0)

library(rvest)
library(dplyr)

# Read the results page and convert every element matching the CSS id
# selector "#eventResults" into a data frame via html_table().
# (An equivalent XPath selector would be //*[@id="eventResults"].)
url <- "http://m.ironman.com/triathlon/events/americas/ironman-70.3/pucon/results.aspx"
tables <- read_html(url) %>%
  html_nodes(css = "#eventResults") %>%
  html_table()

# Fail with a readable message instead of "subscript out of bounds" when the
# selector matches nothing.
if (length(tables) == 0) {
  stop("No element matching '#eventResults' was found on the page.",
       call. = FALSE)
}

# Keep the first matched table, restricted to the athlete name and the
# Swim/Bike/Run/Finish columns.
df <- tables[[1]] %>%
  select(Name, Swim, Bike, Run, Finish)

答案 1 :(得分:0)

# Packages this snippet relies on: rvest (read_html, html_nodes, html_text),
# stringr (str_trim, str_remove_all), tidyr (separate), dplyr (mutate) and
# lubridate (hms). They are attached here so the snippet runs on its own.
library(rvest)
library(stringr)
library(tidyr)
library(dplyr)
library(lubridate)

url <- "http://m.ironman.com/triathlon/events/americas/ironman-70.3/pucon/results.aspx"
html <- read_html(url)

# Gather the text of the HTML comment nodes, re-parse that text as HTML, and
# turn the text of each <tr> into one row of a single-column data frame.
df <- html %>%
  html_nodes('div') %>%
  html_nodes(xpath = '//comment()') %>%
  html_text() %>%             # extract comment text
  paste(collapse = '') %>%    # collapse to a single string
  read_html() %>%
  html_nodes('tr') %>%
  html_text() %>%
  str_trim() %>%              # drop leading/trailing whitespace of each row
  str_remove_all(' ') %>%     # drop every remaining space character
  as.data.frame()

names(df) <- 'All'

# Split each row string into fields on '\r\n', split the name field on ',',
# strip digits from `first`, and convert the split times with hms().
# NOTE(review): `first` is named in both separate() calls; confirm the
# resulting columns against the live page.
df <- df %>%
  separate(
    All,
    c('last', 'first', 'swim', 'bike', 'run', 'div', 'gender', 'overall'),
    sep = '\r\n'
  ) %>%
  separate(last, c('last', 'first'), sep = ',') %>%
  mutate(first = gsub('[0-9]', '', first)) %>%
  mutate(swim = hms(swim), bike = hms(bike), run = hms(run)) %>%
  # total is the sum of the three splits, passed through hms() with roll = TRUE
  mutate(total = hms(swim + bike + run, roll = TRUE))