总之,我在从多个网页抓取数据时遇到了一些问题。
library(RCurl)
library(XML)
# Download each month's schedule page and save its "schedule" table to CSV.
# Bug in the original: readHTMLTable() was called once BEFORE the loop and
# its result inside the loop was never assigned, so `tables` always held the
# first page and the same October data was written to all four files.
for (i in c("october", "november", "december", "january")) {
  # Fetch and parse THIS month's page, and keep the result.
  tables <- readHTMLTable(getURL(paste0(
    "https://www.basketball-reference.com/leagues/NBA_2018_games-", i, ".html"
  )))
  regular <- tables[["schedule"]]
  # NOTE(review): the original writes e.g. "octoberoctober.csv" (month name
  # doubled); kept as-is to preserve the output file names.
  write.csv(regular, file = paste0("./", i, i, ".csv"))
}
我遇到的问题是:它似乎没有按月循环,而是保存了4个内容都来自10月份的文件。如有任何帮助,不胜感激。
答案 0(得分:0)
这不是最优雅的方式,但效果很好。
希望能帮到你。
rm(list = ls())
# Install rvest on first use, then load it.
if (!require("rvest")) {
  install.packages("rvest")
  library("rvest")
}

# For every month, scrape the schedule page, rebuild the table from its
# left- and right-aligned cells, and write one CSV per month.
for (month in c("october", "november", "december", "january")) {
  page <- read_html(paste0(
    "https://www.basketball-reference.com/leagues/NBA_2018_games-",
    month, ".html"
  ))

  # Text of the left-aligned cells (date / visitor / home, 4-column stride).
  left_cells <- page %>%
    html_nodes(".left") %>%
    html_text()
  left_cells <- left_cells[-length(left_cells)]  # drop trailing cell
  left_cells <- left_cells[-(1:4)]               # drop header cells
  game_date <- left_cells[seq(1, length(left_cells), 4)]
  visitor   <- left_cells[seq(2, length(left_cells), 4)]
  home      <- left_cells[seq(3, length(left_cells), 4)]

  # Text of the right-aligned cells (start time / two scores, 3-col stride).
  right_cells <- page %>%
    html_nodes(".right") %>%
    html_text()
  right_cells <- right_cells[-length(right_cells)]  # drop trailing cell
  right_cells <- right_cells[-(1:2)]                # drop header cells
  start_time <- right_cells[seq(1, length(right_cells), 3)]
  pts_away   <- right_cells[seq(2, length(right_cells), 3)]
  pts_home   <- right_cells[seq(3, length(right_cells), 3)]

  # Assemble and save one file per month (column names kept as before).
  nba_data <- data.frame(
    Date = game_date, Start = start_time, Visitor = visitor,
    PTS1 = pts_away, Home = home, PTS2 = pts_home
  )
  write.csv(nba_data, file = paste0("./", month, month, ".csv"))
}
答案 1(得分:0)
这是使用 tidyverse 抓取本网站的解决方案。但首先我们检查网站的 robots.txt 文件,以了解请求的速率限制。有关详细信息,请参阅帖子 Analyzing “Crawl-Delay” Settings in Common Crawl robots.txt Data with R 以供参考。
library(spiderbar)
library(robotstxt)

# Fetch and parse the site's robots.txt, then list per-agent crawl delays.
robots_rules <- robxp(get_robotstxt("https://www.basketball-reference.com"))
crawl_delays(robots_rules)
#> agent crawl_delay
#> 1 * 3
#> 2 ahrefsbot -1
#> 3 twitterbot -1
#> 4 slysearch -1
#> 5 ground-control -1
#> 6 groundcontrol -1
#> 7 matrix -1
#> 8 hal9000 -1
#> 9 carmine -1
#> 10 the-matrix -1
#> 11 skynet -1
我们对 `*` 这一行的值感兴趣。可以看到,我们必须在请求之间至少等待3秒。这里我们将等待5秒。
我们使用 tidyverse 生态系统来构建网址,并迭代它们以获取包含所有数据的表格。
library(tidyverse)
library(rvest)
#> Le chargement a nécessité le package : xml2
#>
#> Attachement du package : 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
# Build one schedule URL per month; paste0() is vectorized over month_sub,
# so no explicit iteration is needed.
month_sub <- c("october", "november", "december", "january")
urls <- paste0(
  "https://www.basketball-reference.com/leagues/NBA_2018_games-",
  month_sub, ".html"
)
urls
#> [1] "https://www.basketball-reference.com/leagues/NBA_2018_games-october.html"
#> [2] "https://www.basketball-reference.com/leagues/NBA_2018_games-november.html"
#> [3] "https://www.basketball-reference.com/leagues/NBA_2018_games-december.html"
#> [4] "https://www.basketball-reference.com/leagues/NBA_2018_games-january.html"
pb <- progress_estimated(length(urls))
# Politely fetch every month's schedule table: pause 5 s between requests
# (above the 3 s crawl-delay the site's robots.txt asks for).
# Fix: the original ended the pipeline with right-assignment `-> tables`,
# which hides the assignment target 12 lines away from the value; use a
# conventional left assignment instead.
tables <- map(urls, ~{
  url <- .
  pb$tick()$print()
  Sys.sleep(5)  # respect the crawl delay
  read_html(url) %>%
    # we select the table part by its table id tag
    html_nodes("#schedule") %>%
    # we extract the table
    html_table() %>%
    # we get a 1 element list so we take flatten to get a tibble
    flatten_df()
})
# We now have a list of tables, one per month; inspect its shape.
str(tables, 1)
#> List of 4
#> $ :Classes 'tbl_df', 'tbl' and 'data.frame': 104 obs. of 8 variables:
#> $ :Classes 'tbl_df', 'tbl' and 'data.frame': 213 obs. of 8 variables:
#> $ :Classes 'tbl_df', 'tbl' and 'data.frame': 227 obs. of 8 variables:
#> $ :Classes 'tbl_df', 'tbl' and 'data.frame': 216 obs. of 8 variables:
# Combine all months into one table by row-binding. The site's table has
# two empty, unnamed columns, so repair the names first (they become V1/V2)
# to make the tibbles bindable.
res <- map_df(tables, tibble::repair_names)
res
#> # A tibble: 760 x 8
#> Date `Start (ET)` `Visitor/Neutral` PTS
#> <chr> <chr> <chr> <int>
#> 1 Tue, Oct 17, 2017 8:01 pm Boston Celtics 102
#> 2 Tue, Oct 17, 2017 10:30 pm Houston Rockets 121
#> 3 Wed, Oct 18, 2017 7:30 pm Milwaukee Bucks 100
#> 4 Wed, Oct 18, 2017 8:30 pm Atlanta Hawks 111
#> 5 Wed, Oct 18, 2017 7:00 pm Charlotte Hornets 102
#> 6 Wed, Oct 18, 2017 7:00 pm Brooklyn Nets 140
#> 7 Wed, Oct 18, 2017 8:00 pm New Orleans Pelicans 103
#> 8 Wed, Oct 18, 2017 7:00 pm Miami Heat 116
#> 9 Wed, Oct 18, 2017 10:00 pm Portland Trail Blazers 76
#> 10 Wed, Oct 18, 2017 10:00 pm Houston Rockets 100
#> # ... with 750 more rows, and 4 more variables: `Home/Neutral` <chr>,
#> # V1 <chr>, V2 <chr>, Notes <lgl>