无法使用 rvest 抓取页面中的第二个表格

时间:2017-04-18 15:51:09

标签: r xpath web-scraping rvest

我可以使用 rvest 包,通过以下代码抓取此页面的第一个表格:

library(rvest)
library(magrittr)

# Download and parse the page into an HTML document.
urlbbref <- read_html("http://www.baseball-reference.com/bio/Venezuela_born.shtml")

# Select the node whose id is "bio_batting" and convert it to a data frame.
Bat <- html_table(
  html_node(urlbbref, xpath = '//*[(@id = "bio_batting")]')
)

但是我无法抓取这个页面的第二个表格。我使用 SelectorGadget 查找了两个表格的 XPath,并在代码中使用了这些信息,但它似乎对第二个表格不起作用。

# Attempt for the second table: look up the node with id "div_bio_pitching"
# in the already-parsed document and convert it with html_table().
Pit <- html_table(
  html_node(urlbbref, xpath = '//*[(@id = "div_bio_pitching")]')
)

1 个答案:

答案 0 :(得分:2)

我总共提取出了 3 个表格。

library(magrittr)
library(rvest)
library(xml2)
library(stringi)

# Download and parse the page into an HTML document.
urlbbref <- read_html("http://www.baseball-reference.com/bio/Venezuela_born.shtml")

# The first table is present in the regular markup, so a plain XPath query
# finds it. html_table() is applied to the whole node set returned by
# xml_find_all().
table_one <- xml_find_all(urlbbref, "//table") %>% html_table

# Additional tables are within comment tags, i.e. <!-- tables -->,
# which is why an XPath query on the element tree misses them.
# First get the comment nodes.
alt_tables <- xml2::xml_find_all(urlbbref, "//comment()") %>% {
  # Serialise every comment node once, then keep only those that contain
  # HTML table markup. The "<" must NOT be backslash-escaped: in R's default
  # regex engine "\\<" is a start-of-word anchor, not a literal "<".
  comment_text <- as.character(.)
  raw_parts <- comment_text[grepl("</?table", comment_text)]
  # Remove the comment begin and end markers so the content can be parsed
  # as ordinary HTML.
  strip_html <- stringi::stri_replace_all_regex(
    raw_parts,
    c("<\\!--", "-->"),
    c("", ""),
    vectorize_all = FALSE
  )
  # Parse each fragment that has an opening <table> tag and apply the same
  # extraction as above, keeping the first table found in each fragment.
  lapply(grep("<table", strip_html, value = TRUE), function(i) {
    rvest::html_table(xml_find_all(read_html(i), "//table")) %>%
      .[[1]]
  })
}

# Put all the data frames into a single list.
all_tables <- c(
  table_one, alt_tables
)

结果:

> Map(str, all_tables)
'data.frame':   361 obs. of  27 variables:
 $ Rk        : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Name      : chr  "Bobby Abreu" "Ehire Adrianza" "Jesus Aguilar" "Edgardo Alfonzo" ...
 $ Yrs       : int  18 4 4 12 6 7 1 5 5 2 ...
 $ From      : int  1996 2013 2014 1995 2006 2011 2000 2011 2013 2002 ...
 $ To        : int  2014 2016 2017 2006 2011 2017 2000 2015 2017 2004 ...
 $ ASG       : int  2 0 0 1 0 4 0 1 0 0 ...
 $ G         : int  2425 154 47 1506 193 842 2 92 150 38 ...
 $ PA        : int  10081 331 89 6108 624 3708 5 109 3 75 ...
 $ AB        : int  8480 291 81 5385 591 3411 5 94 2 64 ...
 $ R         : int  1453 27 4 777 44 456 1 5 0 11 ...
 $ H         : int  2470 64 18 1532 142 1062 1 22 0 16 ...
 $ 2B        : int  574 16 3 282 24 208 0 4 0 4 ...
 $ 3B        : int  59 1 0 18 3 19 0 0 0 0 ...
 $ HR        : int  288 3 0 146 17 60 0 1 0 2 ...
 $ RBI       : int  1363 26 8 744 67 326 0 9 0 10 ...
 $ SB        : int  400 4 0 53 1 204 0 0 0 1 ...
 $ CS        : int  128 4 0 17 2 59 0 0 0 0 ...
 $ BB        : int  1476 23 6 596 17 214 0 1 1 7 ...
 $ SO        : int  1840 60 28 617 158 389 1 34 0 12 ...
 $ BA        : num  0.291 0.22 0.222 0.284 0.24 0.311 0.2 0.234 0 0.25 ...
 $ OBP       : num  0.395 0.292 0.281 0.357 0.271 0.354 0.2 0.237 0.333 0.324 ...
 $ SLG       : num  0.475 0.313 0.259 0.425 0.377 0.436 0.2 0.309 0 0.406 ...
 $ OPS       : num  0.87 0.605 0.54 0.782 0.648 0.791 0.4 0.546 0.333 0.731 ...
 $ Birthdate : chr  "Mar 11, 1974" "Aug 21, 1989" "Jun 30, 1990" "Nov 8, 1973" ...
 $ Debut     : chr  "Sep 1, 1996" "Sep 8, 2013" "May 15, 2014" "Apr 26, 1995" ...
 $ Birthplace: chr  "Maracay, Aragua" "Guarenas, Miranda" "Maracay, Aragua" "Santa Teresa del Tuy, Miranda" ...
 $ Pos       : chr  "POS" "POS" "POS" "POS" ...
'data.frame':   157 obs. of  31 variables:
 $ Rk        : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Name      : chr  "Henderson Alvarez" "Jose Alvarez" "Wilson Alvarez" "Alexi Amarista" ...
 $ Yrs       : int  5 5 14 7 5 2 10 4 6 4 ...
 $ From      : int  2011 2013 1989 2011 1980 2015 1999 2007 2012 2005 ...
 $ To        : int  2015 2017 2005 2017 1984 2016 2008 2011 2017 2009 ...
 $ ASG       : int  1 0 1 0 0 0 0 0 0 0 ...
 $ W         : int  27 6 102 0 9 4 53 1 15 3 ...
 $ L         : int  34 12 92 0 6 2 65 3 6 4 ...
 $ W-L%      : num  0.443 0.333 0.526 NA 0.6 0.667 0.449 0.25 0.714 0.429 ...
 $ ERA       : num  3.8 3.97 3.96 0 3.27 4.35 4.65 5.28 2.91 6.86 ...
 $ G         : int  92 150 355 2 110 72 185 43 275 25 ...
 $ GS        : int  92 6 263 0 0 0 167 0 0 8 ...
 $ GF        : int  0 32 18 2 66 14 7 16 36 12 ...
 $ CG        : int  5 0 12 0 0 0 0 0 0 0 ...
 $ SHO       : int  5 0 5 0 0 0 0 0 0 0 ...
 $ SV        : int  0 0 4 0 7 0 0 0 0 0 ...
 $ IP        : num  563 167.2 1747.2 0.2 220 ...
 $ H         : int  596 174 1624 0 222 64 891 57 177 68 ...
 $ R         : int  261 85 857 0 86 39 519 29 75 51 ...
 $ ER        : int  238 74 769 0 80 30 478 27 72 46 ...
 $ HR        : int  54 17 190 0 17 5 122 7 10 4 ...
 $ BB        : int  129 55 805 0 68 36 431 21 80 34 ...
 $ IBB       : int  7 10 29 0 7 3 41 5 17 1 ...
 $ SO        : int  296 148 1330 0 113 63 680 41 180 37 ...
 $ HBP       : int  22 8 50 0 3 2 51 4 11 4 ...
 $ BK        : int  3 1 4 0 3 1 6 0 3 1 ...
 $ WP        : int  16 3 28 0 5 2 43 1 14 2 ...
 $ BF        : int  2358 729 7518 2 928 285 4055 221 913 282 ...
 $ Birthdate : chr  "Apr 18, 1990" "May 6, 1989" "Mar 24, 1970" "Apr 6, 1989" ...
 $ Debut     : chr  "Aug 10, 2011" "Jun 9, 2013" "Jul 24, 1989" "Apr 26, 2011" ...
 $ Birthplace: chr  "Valencia, Carabobo" "Barcelona, Anzoategui" "Maracaibo, Zulia" "Barcelona, Anzoategui" ...
'data.frame':   3 obs. of  17 variables:
 $ Rk        : int  1 2 NA
 $ Mgr       : chr  "Ozzie Guillen" "Al Pedrique" "Totals"
 $ Yrs       : int  9 1 10
 $ From      : int  2004 2004 2004
 $ To        : int  2012 2004 2012
 $ W         : int  747 22 769
 $ L         : int  710 61 771
 $ W-L%      : num  0.513 0.265 0.499
 $ Ties      : int  0 0 0
 $ G>.500    : int  37 -39 -2
 $ G         : int  1457 83 1540
 $ BestFin   : int  1 5 1
 $ WrstFin   : int  5 5 5
 $ AvRk      : num  2.7 5 2.8
 $ Birthdate : chr  "Jan 20, 1964" "Aug 11, 1960" ""
 $ Debut     : chr  "Apr 9, 1985" "Apr 14, 1987" ""
 $ Birthplace: chr  "Ocumare del Tuy, Miranda" "Valencia, Carabobo" ""