我正在努力从Alberta Electric System Operator网站(AESO Site)下载(理想情况下是csv,但我也可以处理html格式)数据。通过填写表单然后单击“确定”单选按钮来访问数据。
我尝试使用rvest和curl来尝试访问它,但是已经碰壁了。问题似乎是servlet位于框架内
我认为这与我使用getForm一样接近:
url <- "http://ets.aeso.ca/ets_web/docroot/Market/Reports/HistoricalReportsStart.html"
if(url.exists(url))
postForm(url,
SelectFormat = "html",
SelectReport = "--- Metered Volumes (All)",
BeginMonth = 12,
BeginDay = 12,
BeginYear =2016,
EndMonth = 12,
EndDay =13,
EndYear =2016,
radiobutton = "OK",submit = "OK", style = "POST")
test<-getForm(url, .params = fd)
并且,我也尝试过使用rvest:
s <- html_session(url)
f0 <- html_form(s)
然而,这似乎是我收到错误的地方,因为表单位于一个框架内,所以我在f0中没有内容。
非常感谢任何帮助。
答案 0 :(得分:1)
直接点击报告:
library(httr)
library(rvest)
library(stringi)
library(tidyverse)
get_metered_volumes_report <- function(start_date, end_date) {
start_date <- as.Date(start_date)
end_date <- as.Date(end_date)
GET(
url = "http://ets.aeso.ca/ets_web/ip/Market/Reports/PublicSummaryAllReportServlet",
query = list(
beginDate = format(start_date, "%m%d%Y"),
endDate = format(end_date, "%m%d%Y"),
contentType = "csv"
)
) -> res
stop_for_status(res)
# Neither the CSV nor HTML output is all that great but the CSV
# can be made to work with (IMO) less effort than the HTML. You may
# need to do some extra checks for data format (for either CSV or
# HTML), though, in "production" mode.
# From what I saw in the output, you likely need to modify
# this attempt at munging since the "hours" seem off, but you
# at least now have the data.
content(res, as="text") %>%
stri_split_lines() %>%
flatten_chr() ->
read.csv(
text = paste0(c(paste0(l[8:9], collapse=","), l[11:length(l)]), collapse="\n"),
header = TRUE, stringsAsFactors=FALSE
) %>% janitor::clean_names() %>%
tbl_df()
}
示例:
xdf <- get_metered_volumes_report("2016-12-12", "2016-12-13")
xdf
## # A tibble: 2,877 x 30
## pool_participant_id asset_type asset_id x x_1 x_2 hour_1 hour_2 hour_3 hour_4 hour_5 hour_6 hour_7 hour_8 hour_9 hour_10 hour_11
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 - - - 28.40 23.07 21.41 22.22 23.78 37.37 38.94 39.97 46.00 47.26 38.49 42.51 41.15 43.91
## 2 4285 IPP 42G1 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 3 9496 RETAILER 941A 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 4 9496 RETAILER 941C 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 5 9496 RETAILER 941E 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 6 9496 RETAILER 941F 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 7 9496 RETAILER 941L 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 8 9496 RETAILER 941P 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 9 9496 RETAILER 941R 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## 10 9496 RETAILER 941U 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
## # ... with 2,867 more rows, and 13 more variables: hour_12 <chr>, hour_13 <chr>, hour_14 <chr>, hour_15 <chr>, hour_16 <chr>, hour_17 <chr>,
## # hour_18 <chr>, hour_19 <chr>, hour_20 <chr>, hour_21 <chr>, hour_22 <lgl>, hour_23 <lgl>, hour_24 <lgl>
和
glimpse(xdf)
## Observations: 2,877
## Variables: 30
## $ pool_participant_id <chr> "-", "4285", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9558", "9558", "9558", "95...
## $ asset_type <chr> "-", "IPP", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RET...
## $ asset_id <chr> "-", "42G1", "941A", "941C", "941E", "941F", "941L", "941P", "941R", "941U", "941X", "G035", "G036", "951A", "95...
## $ x <chr> "28.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_1 <chr> "23.07", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_2 <chr> "21.41", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_1 <chr> "22.22", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_2 <chr> "23.78", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_3 <chr> "37.37", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_4 <chr> "38.94", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_5 <chr> "39.97", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_6 <chr> "46.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_7 <chr> "47.26", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_8 <chr> "38.49", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_9 <chr> "42.51", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_10 <chr> "41.15", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_11 <chr> "43.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_12 <chr> "46.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_13 <chr> "45.73", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_14 <chr> "49.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_15 <chr> "34.90", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_16 <chr> "25.82", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_17 <chr> "24.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_18 <chr> "25.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_19 <chr> "27.99", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_20 <chr> "29.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_21 <chr> "24.27", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_22 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_23 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_24 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
您也可以定位其他报告网址: