通过框架

时间:2017-12-24 22:08:29

标签: r curl rvest httr

我正在努力从Alberta Electric System Operator网站(AESO Site)下载(理想情况下是csv,但我也可以处理html格式)数据。通过填写表单然后单击“确定”单选按钮来访问数据。

我尝试过用rvest和curl来访问它,但都碰了壁。问题似乎在于该servlet位于一个框架(frame)内。

下面使用postForm/getForm的尝试,是我目前最接近成功的一次:

 # Start page of the historical-reports form that the asker is trying to submit.
 # NOTE(review): url.exists(), postForm() and getForm() are presumably from
 # RCurl; no library() call for it is shown in this snippet.
 url <- "http://ets.aeso.ca/ets_web/docroot/Market/Reports/HistoricalReportsStart.html"

 # If the URL responds, POST the form fields for the "Metered Volumes (All)"
 # report covering 2016-12-12 to 2016-12-13.
 # NOTE(review): the value returned by postForm() is not assigned to anything.
         if(url.exists(url)) 
       postForm(url,
                SelectFormat  = "html",
                SelectReport = "--- Metered Volumes (All)",
                BeginMonth = 12,
                BeginDay = 12,
                BeginYear =2016,
                EndMonth = 12,
                EndDay =13,
                EndYear =2016,
                radiobutton = "OK",submit = "OK", style = "POST")

 # NOTE(review): `fd` is not defined anywhere in the visible snippet, so this
 # call cannot work as written unless `fd` exists elsewhere in the session.
     test<-getForm(url, .params = fd)

并且,我也尝试过使用rvest:

 # Open an rvest session on the same start page and extract the forms it contains.
 # NOTE(review): per the asker, f0 comes back empty because the form sits inside a frame.
 s <- html_session(url)
 f0 <- html_form(s)

然而,这似乎是我收到错误的地方,因为表单位于一个框架内,所以我在f0中没有内容。

非常感谢任何帮助。

1 个答案:

答案 0 :(得分:1)

直接请求报告servlet的URL即可(无需经过框架中的表单):

library(httr)
library(rvest)
library(stringi)
library(tidyverse)

#' Download the AESO "Metered Volumes (All)" report as a tibble
#'
#' Requests the report servlet directly (instead of going through the framed
#' HTML form) and parses the CSV text it returns.
#'
#' @param start_date,end_date First and last day of the report. Anything that
#'   `as.Date()` can convert (e.g. `"2016-12-12"`). Each must be a single,
#'   non-missing date, and `end_date` must not precede `start_date`.
#' @return A tibble built from the CSV payload, with column names normalised
#'   by `janitor::clean_names()`.
get_metered_volumes_report <- function(start_date, end_date) {

  start_date <- as.Date(start_date)
  end_date <- as.Date(end_date)

  # Fail early, before any network traffic, on dates that cannot be formatted
  # into a meaningful query string.
  if (length(start_date) != 1L || is.na(start_date)) {
    stop("`start_date` must be a single, non-missing date", call. = FALSE)
  }
  if (length(end_date) != 1L || is.na(end_date)) {
    stop("`end_date` must be a single, non-missing date", call. = FALSE)
  }
  if (end_date < start_date) {
    stop("`end_date` must not be earlier than `start_date`", call. = FALSE)
  }

  res <- GET(
    url = "http://ets.aeso.ca/ets_web/ip/Market/Reports/PublicSummaryAllReportServlet",
    query = list(
      beginDate = format(start_date, "%m%d%Y"),
      endDate = format(end_date, "%m%d%Y"),
      contentType = "csv"
    )
  )

  stop_for_status(res)

  # Neither the CSV nor HTML output is all that great but the CSV
  # can be made to work with (IMO) less effort than the HTML. You may
  # need to do some extra checks for data format (for either CSV or
  # HTML), though, in "production" mode.

  # From what I saw in the output, you likely need to modify
  # this attempt at munging since the "hours" seem off, but you
  # at least now have the data.

  # Split the response body into individual lines.
  l <- content(res, as = "text") %>%
    stri_split_lines() %>%
    flatten_chr()

  # The slicing below relies on fixed line positions (8, 9 and 11 onwards);
  # a shorter response would make `11:length(l)` count backwards.
  if (length(l) < 11L) {
    stop(
      "Unexpected report format: expected at least 11 lines, got ",
      length(l),
      call. = FALSE
    )
  }

  # Lines 8 and 9 are joined with a comma into a single header row; line 10
  # is skipped and everything from line 11 on is treated as data.
  header_line <- paste0(l[8:9], collapse = ",")
  data_lines <- l[11:length(l)]

  read.csv(
    text = paste0(c(header_line, data_lines), collapse = "\n"),
    header = TRUE, stringsAsFactors = FALSE
  ) %>%
    janitor::clean_names() %>%
    as_tibble()

}

示例:

# Example call: fetch the report for 2016-12-12 through 2016-12-13.
xdf <- get_metered_volumes_report("2016-12-12", "2016-12-13")

# Print the result; the lines prefixed with `##` below are the printed output.
xdf
## # A tibble: 2,877 x 30
##    pool_participant_id asset_type asset_id      x    x_1    x_2 hour_1 hour_2 hour_3 hour_4 hour_5 hour_6 hour_7 hour_8 hour_9 hour_10 hour_11
##                  <chr>      <chr>    <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  <chr>   <chr>   <chr>
##  1                   -          -        -  28.40  23.07  21.41  22.22  23.78  37.37  38.94  39.97  46.00  47.26  38.49  42.51   41.15   43.91
##  2                4285        IPP     42G1 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  3                9496   RETAILER     941A 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  4                9496   RETAILER     941C 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  5                9496   RETAILER     941E 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  6                9496   RETAILER     941F 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  7                9496   RETAILER     941L 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  8                9496   RETAILER     941P 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
##  9                9496   RETAILER     941R 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
## 10                9496   RETAILER     941U 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000  0.0000  0.0000
## # ... with 2,867 more rows, and 13 more variables: hour_12 <chr>, hour_13 <chr>, hour_14 <chr>, hour_15 <chr>, hour_16 <chr>, hour_17 <chr>,
## #   hour_18 <chr>, hour_19 <chr>, hour_20 <chr>, hour_21 <chr>, hour_22 <lgl>, hour_23 <lgl>, hour_24 <lgl>

# Show every column of the result with its type and leading values.
glimpse(xdf)
## Observations: 2,877
## Variables: 30
## $ pool_participant_id <chr> "-", "4285", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9496", "9558", "9558", "9558", "95...
## $ asset_type          <chr> "-", "IPP", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RETAILER", "RET...
## $ asset_id            <chr> "-", "42G1", "941A", "941C", "941E", "941F", "941L", "941P", "941R", "941U", "941X", "G035", "G036", "951A", "95...
## $ x                   <chr> "28.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_1                 <chr> "23.07", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ x_2                 <chr> "21.41", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_1              <chr> "22.22", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_2              <chr> "23.78", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_3              <chr> "37.37", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_4              <chr> "38.94", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_5              <chr> "39.97", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_6              <chr> "46.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_7              <chr> "47.26", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_8              <chr> "38.49", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_9              <chr> "42.51", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_10             <chr> "41.15", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_11             <chr> "43.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_12             <chr> "46.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_13             <chr> "45.73", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_14             <chr> "49.95", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_15             <chr> "34.90", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_16             <chr> "25.82", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_17             <chr> "24.00", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_18             <chr> "25.91", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_19             <chr> "27.99", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_20             <chr> "29.40", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_21             <chr> "24.27", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0.0000", "0....
## $ hour_22             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_23             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ hour_24             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...

您也可以用同样的方式请求其他报告的网址:

enter image description here