我可以使用 rvest 包,通过以下代码抓取此页面的第一个表格:
library(rvest)
library(magrittr)
# Download and parse the page once so it can be queried repeatedly.
urlbbref <- read_html("http://www.baseball-reference.com/bio/Venezuela_born.shtml")
# Select the node whose id is "bio_batting" and convert it to a data frame.
Bat <- html_table(html_node(urlbbref, xpath = '//*[(@id = "bio_batting")]'))
但是我无法抓取这个页面的第二个表格。我使用 selectorgadget 查找了两个表格的 XPath,并在代码中使用了这些信息,但它似乎对第二个表格不起作用。
# Attempt to read the second (pitching) table using the id reported by
# selectorgadget; this is the call that does not work for the asker.
# NOTE(review): per the answer below, the additional tables sit inside HTML
# comment tags (<!-- ... -->) in the page source, which is why this XPath
# misses them.
Pit <- urlbbref %>%
html_node(xpath = '//*[(@id = "div_bio_pitching")]') %>%
html_table()
答案 0 :(得分:2)
我总共提取出了 3 个表格。
library(magrittr)
library(rvest)
library(xml2)
library(stringi)
urlbbref <- read_html("http://www.baseball-reference.com/bio/Venezuela_born.shtml")

# Tables that are part of the live markup can be queried directly.
table_one <- html_table(xml_find_all(urlbbref, "//table"))

# Additional tables are wrapped in comment tags, i.e. <!-- tables -->, which
# is why an XPath query against the parsed document misses them.
# local() keeps the intermediate values out of the global workspace.
alt_tables <- local({
  # Serialise every comment node back to its raw "<!-- ... -->" text.
  comment_text <- as.character(xml_find_all(urlbbref, "//comment()"))

  # Keep only the comments that contain table markup. A fixed-string match is
  # used on purpose: in R's default regex syntax "\\<" is a start-of-word
  # anchor rather than a literal "<", so a pattern such as "\\</?table" would
  # also accept comments that merely mention the word "table".
  with_table <- comment_text[grepl("<table", comment_text, fixed = TRUE)]

  # Remove the comment delimiters so the payload parses as ordinary HTML.
  table_html <- stringi::stri_replace_all_regex(with_table, "<!--|-->", "")

  # Re-parse each fragment and keep the first table found in it.
  lapply(table_html, function(fragment) {
    html_table(xml_find_all(read_html(fragment), "//table"))[[1]]
  })
})

# Put all the data frames into a single list.
all_tables <- c(table_one, alt_tables)
> Map(str, all_tables)
'data.frame': 361 obs. of 27 variables:
$ Rk : int 1 2 3 4 5 6 7 8 9 10 ...
$ Name : chr "Bobby Abreu" "Ehire Adrianza" "Jesus Aguilar" "Edgardo Alfonzo" ...
$ Yrs : int 18 4 4 12 6 7 1 5 5 2 ...
$ From : int 1996 2013 2014 1995 2006 2011 2000 2011 2013 2002 ...
$ To : int 2014 2016 2017 2006 2011 2017 2000 2015 2017 2004 ...
$ ASG : int 2 0 0 1 0 4 0 1 0 0 ...
$ G : int 2425 154 47 1506 193 842 2 92 150 38 ...
$ PA : int 10081 331 89 6108 624 3708 5 109 3 75 ...
$ AB : int 8480 291 81 5385 591 3411 5 94 2 64 ...
$ R : int 1453 27 4 777 44 456 1 5 0 11 ...
$ H : int 2470 64 18 1532 142 1062 1 22 0 16 ...
$ 2B : int 574 16 3 282 24 208 0 4 0 4 ...
$ 3B : int 59 1 0 18 3 19 0 0 0 0 ...
$ HR : int 288 3 0 146 17 60 0 1 0 2 ...
$ RBI : int 1363 26 8 744 67 326 0 9 0 10 ...
$ SB : int 400 4 0 53 1 204 0 0 0 1 ...
$ CS : int 128 4 0 17 2 59 0 0 0 0 ...
$ BB : int 1476 23 6 596 17 214 0 1 1 7 ...
$ SO : int 1840 60 28 617 158 389 1 34 0 12 ...
$ BA : num 0.291 0.22 0.222 0.284 0.24 0.311 0.2 0.234 0 0.25 ...
$ OBP : num 0.395 0.292 0.281 0.357 0.271 0.354 0.2 0.237 0.333 0.324 ...
$ SLG : num 0.475 0.313 0.259 0.425 0.377 0.436 0.2 0.309 0 0.406 ...
$ OPS : num 0.87 0.605 0.54 0.782 0.648 0.791 0.4 0.546 0.333 0.731 ...
$ Birthdate : chr "Mar 11, 1974" "Aug 21, 1989" "Jun 30, 1990" "Nov 8, 1973" ...
$ Debut : chr "Sep 1, 1996" "Sep 8, 2013" "May 15, 2014" "Apr 26, 1995" ...
$ Birthplace: chr "Maracay, Aragua" "Guarenas, Miranda" "Maracay, Aragua" "Santa Teresa del Tuy, Miranda" ...
$ Pos : chr "POS" "POS" "POS" "POS" ...
'data.frame': 157 obs. of 31 variables:
$ Rk : int 1 2 3 4 5 6 7 8 9 10 ...
$ Name : chr "Henderson Alvarez" "Jose Alvarez" "Wilson Alvarez" "Alexi Amarista" ...
$ Yrs : int 5 5 14 7 5 2 10 4 6 4 ...
$ From : int 2011 2013 1989 2011 1980 2015 1999 2007 2012 2005 ...
$ To : int 2015 2017 2005 2017 1984 2016 2008 2011 2017 2009 ...
$ ASG : int 1 0 1 0 0 0 0 0 0 0 ...
$ W : int 27 6 102 0 9 4 53 1 15 3 ...
$ L : int 34 12 92 0 6 2 65 3 6 4 ...
$ W-L% : num 0.443 0.333 0.526 NA 0.6 0.667 0.449 0.25 0.714 0.429 ...
$ ERA : num 3.8 3.97 3.96 0 3.27 4.35 4.65 5.28 2.91 6.86 ...
$ G : int 92 150 355 2 110 72 185 43 275 25 ...
$ GS : int 92 6 263 0 0 0 167 0 0 8 ...
$ GF : int 0 32 18 2 66 14 7 16 36 12 ...
$ CG : int 5 0 12 0 0 0 0 0 0 0 ...
$ SHO : int 5 0 5 0 0 0 0 0 0 0 ...
$ SV : int 0 0 4 0 7 0 0 0 0 0 ...
$ IP : num 563 167.2 1747.2 0.2 220 ...
$ H : int 596 174 1624 0 222 64 891 57 177 68 ...
$ R : int 261 85 857 0 86 39 519 29 75 51 ...
$ ER : int 238 74 769 0 80 30 478 27 72 46 ...
$ HR : int 54 17 190 0 17 5 122 7 10 4 ...
$ BB : int 129 55 805 0 68 36 431 21 80 34 ...
$ IBB : int 7 10 29 0 7 3 41 5 17 1 ...
$ SO : int 296 148 1330 0 113 63 680 41 180 37 ...
$ HBP : int 22 8 50 0 3 2 51 4 11 4 ...
$ BK : int 3 1 4 0 3 1 6 0 3 1 ...
$ WP : int 16 3 28 0 5 2 43 1 14 2 ...
$ BF : int 2358 729 7518 2 928 285 4055 221 913 282 ...
$ Birthdate : chr "Apr 18, 1990" "May 6, 1989" "Mar 24, 1970" "Apr 6, 1989" ...
$ Debut : chr "Aug 10, 2011" "Jun 9, 2013" "Jul 24, 1989" "Apr 26, 2011" ...
$ Birthplace: chr "Valencia, Carabobo" "Barcelona, Anzoategui" "Maracaibo, Zulia" "Barcelona, Anzoategui" ...
'data.frame': 3 obs. of 17 variables:
$ Rk : int 1 2 NA
$ Mgr : chr "Ozzie Guillen" "Al Pedrique" "Totals"
$ Yrs : int 9 1 10
$ From : int 2004 2004 2004
$ To : int 2012 2004 2012
$ W : int 747 22 769
$ L : int 710 61 771
$ W-L% : num 0.513 0.265 0.499
$ Ties : int 0 0 0
$ G>.500 : int 37 -39 -2
$ G : int 1457 83 1540
$ BestFin : int 1 5 1
$ WrstFin : int 5 5 5
$ AvRk : num 2.7 5 2.8
$ Birthdate : chr "Jan 20, 1964" "Aug 11, 1960" ""
$ Debut : chr "Apr 9, 1985" "Apr 14, 1987" ""
$ Birthplace: chr "Ocumare del Tuy, Miranda" "Valencia, Carabobo" ""