我正在尝试从以下站点抓取 NBA 每日 ROS 预测:https://hashtagbasketball.com/fantasy-basketball-projections。
问题是页面默认显示的球员数量是 200,而我想要 400(或者全部都可以)。
此代码检索前 200 个没问题:
> url <- 'https://hashtagbasketball.com/fantasy-basketball-projections'
>
> page <- read_html(url)
>
> projs <- html_table(page)[[3]] %>% ### anything after this just cleans the df
+ rename_all(~gsub('3pm','threes',gsub('\\%','pct',tolower(.)))) %>%
+ mutate_at(vars(matches('pct$')),~stringr::str_sub(.,1,4)) %>%
+ mutate(player = stringr::word(player,1, 2, sep = ' ')) %>%
+ mutate(pos = stringr::word(pos,1,1,sep = ',')) %>%
+ mutate(pos2 = gsub('P','',pos)) %>%
+ drop_na(player) %>%
+ mutate_at(vars(-c(player,matches('pos'),team)),~as.numeric(.)) %>%
+ select(player, matches('pos'),everything(),-`r#`) %>%
+ head(2)
> projs
player pos pos2 team gp mpg fgpct ftpct threes pts treb ast stl blk to total
1 James Harden PG G HOU 64 36.3 0.44 0.86 4.7 34.4 6.6 9.3 1.7 0.8 4.6 17.68
2 Anthony Davis PF F LAL 65 34.8 0.50 0.84 1.3 26.6 9.4 3.2 1.5 2.3 2.5 14.56
这会创建一个包含所有所需类别的表格。但是,当我使用下面的代码时并没有提取所有的统计类别(只有 gp 和 mpg):
> pgsession <- html_session(url)
> pgform <-html_form(pgsession)[[1]]
> filled_form <-set_values(pgform,
+ "ctl00$ContentPlaceHolder1$DDSHOW" = "400")
>
> d <- submit_form(session=pgsession, form=filled_form)
Submitting with '<unnamed>'
>
> y <- d %>%
+ html_nodes("table") %>%
+ .[[3]] %>%
+ html_table(header=TRUE) %>%
+ mutate(PLAYER = stringr::word(PLAYER,1, 2, sep = ' ')) %>%
+ head(2)
> y
R# PLAYER POS TEAM GP MPG TOTAL
1 1 James Harden PG,SG HOU 64 36.3 0.00
2 2 Anthony Davis PF,C LAL 65 34.8 0.00
知道我做错了什么吗? 谢谢
答案 0 :(得分:3)
问题似乎是在您提交表单时,其他变量的复选框没有被选中。您必须手动设置它们。下面的代码展示了如何获得 `ftm` 和 `ftpct`,剩下的就交给您了:
library(tidyverse)
library(rvest)
# Fetch the projections page, tick the extra stat checkboxes, ask for 400
# rows, submit the form, then clean the resulting table.
url <- "https://hashtagbasketball.com/fantasy-basketball-projections"
pgsession <- html_session(url)
pgform <- html_form(pgsession)

# Stat checkboxes to tick before submitting. Only FTM and FT% are enabled
# here; add the names of the other checkbox fields to get more columns.
stat_boxes <- c(
  "ctl00$ContentPlaceHolder1$CBFTM",
  "ctl00$ContentPlaceHolder1$CBFTP"
)

# Address the form's field list by name (`$fields`) rather than by position,
# and give each checkbox a value so that it is sent with the form.
for (box in stat_boxes) {
  pgform[[1]]$fields[[box]]$value <- "checked"
}

# Request 400 rows via the DDSHOW field, then submit.
filled_form <- set_values(
  pgform[[1]],
  "ctl00$ContentPlaceHolder1$DDSHOW" = "400"
)
d <- submit_form(session = pgsession, form = filled_form)

d %>%
  html_nodes("table") %>%
  .[[3]] %>% # the projections are read from the third table on the page
  html_table() %>%
  # Lower-case the column names; "%" becomes "pct" and "3pm" becomes "threes".
  rename_all(~ gsub("3pm", "threes", gsub("\\%", "pct", tolower(.)))) %>%
  # Keep only the first four characters of the percentage columns.
  mutate_at(vars(matches("pct$")), ~ stringr::str_sub(., 1, 4)) %>%
  # Keep the first two words of the player cell.
  mutate(player = stringr::word(player, 1, 2, sep = " ")) %>%
  # Keep the first listed position; pos2 is that position with "P" removed.
  mutate(pos = stringr::word(pos, 1, 1, sep = ",")) %>%
  mutate(pos2 = gsub("P", "", pos)) %>%
  drop_na(player) %>%
  # Everything except player, the pos columns and team is converted to numeric.
  mutate_at(vars(-c(player, matches("pos"), team)), ~ as.numeric(.)) %>%
  # Player and position columns first; the r# column is dropped.
  select(player, matches("pos"), everything(), -`r#`) %>%
  head(2)
# player pos pos2 team gp mpg ftm ftpct total
#1 James Harden PG G HOU 64 36.3 10.4 0.86 10.95
#2 Devin Booker SG SG PHX 70 35.6 6.7 0.91 7.99