我尝试复制网址进行抓取。它从给定的startDate到endDate循环,这是我的代码;
startDate <- as.Date("01-11-17", format="%d-%m-%y")
endDate <- as.Date("31-01-18",format="%d-%m-%y")
theDay <- startDate
while (theDay <= endDate)
{
dy <- as.character(theDay, format="%d")
month <- as.character(theDay, format = "%m")
year <- as.character(theDay, format ="%Y")
wyoming <- "http://weather.uwyo.edu/cgi-bin/sounding?region=seasia&TYPE=TEXT%3ALIST&YEAR="
address <- paste0(wyoming,year,"&MONTH=",month,"&FROM=",dy,"00&T0=",dy,"00&STNM=48657")
print(address)
theDay = theDay + 1
}
我不太了解html,但是我喜欢这段代码https://stackoverflow.com/a/52539658/7356308如何将数据转换为数据框,以便以后使用。它收集网页响应并将数据提取到实际的列名中。它的工作正常..直到我合并循环任务。说明;
Error in wx_dat[[1]] : subscript out of bounds
对此有何建议...谢谢
library(httr)
library(rvest)
startDate <- as.Date("01-11-17", format="%d-%m-%y")
endDate <- as.Date("31-01-18",format="%d-%m-%y")
theDay <- startDate
while (theDay <= endDate)
{
dy <- as.character(theDay, format="%d")
month <- as.character(theDay, format = "%m")
year <- as.character(theDay, format ="%Y")
httr::GET(
url = "http://weather.uwyo.edu/cgi-bin/sounding",
query = list(
region = "seasia",
TYPE = "TEXT:list",
YEAR = year,
MONTH = month,
FROM = paste0(dy,"00"), #is this the root of problem?
STNM = "48657"
)
) -> res
#becoming html document
httr::content(res, as="parsed") %>% html_nodes("pre")-> wx_dat
#extract data
html_text(wx_dat[[1]]) %>% # turn the first <pre> node into text
strsplit("\n") %>% # split it into lines
unlist() %>% # turn it back into a character vector
{ col_names <<- .[3]; . } %>% # pull out the column names
.[-(1:5)] %>% # strip off the header
paste0(collapse="\n") -> readings # turn it back into a big text blob
readr::read_table(readings, col_names = tolower(unlist(strsplit(trimws(col_names),"\ +"))))
#data <- read_table(readings, col_names = tolower(unlist(strsplit(trimws(col_names),"\ +"))))
#to write csv..
print(theDay)
theDay = theDay + 1
}
答案 0 :(得分:2)
事实证明,有些日子没有车站数据。 O_o
由于您可能真的想分析此数据,而是通过SO交互缓慢地达到了这一点(我不会怪您),因此,将前面的httr
调用包装到一个不错的功能:
#' Get Sounding data for a station via University of Wyoming web portal
#'
#' @md
#' @param region one of "`naconf`", "`samer`", "`pac`", "`nz`", "`ant`", "`np`",
#' "`europe`", "`africa`", "`seasia`", "`mideast`" (which matches the
#' values of the drop-down menu on the site)
#' @param date an ISO character string (e.g. `YYYY-mm-dd`) or a valid `Date` object
#' @param from_hr,to_hr one of `00` (or `0`), `12` or `all`; if `all` then both
#' values will be set to `all`
#' @param station_number the station number
#' @return data frame
#' @export
get_sounding_data <- function(region = c("naconf", "samer", "pac", "nz", "ant",
"np", "europe", "africa", "seasia", "mideast"),
date,
from_hr = c("00", "12", "all"),
to_hr = c("00", "12", "all"),
station_number) {
# we use these pkgs (I removed the readr and dplyr dependencies)
suppressPackageStartupMessages({
require("xml2", quietly = TRUE)
require("httr", quietly = TRUE)
require("rvest", quietly = TRUE)
})
# validate region
region <- match.arg(
arg = region,
choices = c(
"naconf", "samer", "pac", "nz", "ant",
"np", "europe", "africa", "seasia", "mideast"
)
)
# this actually validates the date for us if it's a character string
date <- as.Date(date)
# get year and month
year <- as.integer(format(date, "%Y"))
stopifnot(year %in% 1973:as.integer(format(Sys.Date(), "%Y")))
year <- as.character(year)
month <- format(date, "%m")
# we need these to translate day & *_hr to the param the app needs
c(
"0100", "0112", "0200", "0212", "0300", "0312", "0400", "0412",
"0500", "0512", "0600", "0612", "0700", "0712", "0800", "0812",
"0900", "0912", "1000", "1012", "1100", "1112", "1200", "1212",
"1300", "1312", "1400", "1412", "1500", "1512", "1600", "1612",
"1700", "1712", "1800", "1812", "1900", "1912", "2000", "2012",
"2100", "2112", "2200", "2212", "2300", "2312", "2400", "2412",
"2500", "2512", "2600", "2612", "2700", "2712", "2800", "2812",
"2900", "2912", "3000", "3012", "3100", "3112"
) -> hr_vals
c(
"01/00Z", "01/12Z", "02/00Z", "02/12Z", "03/00Z", "03/12Z", "04/00Z",
"04/12Z", "05/00Z", "05/12Z", "06/00Z", "06/12Z", "07/00Z", "07/12Z",
"08/00Z", "08/12Z", "09/00Z", "09/12Z", "10/00Z", "10/12Z", "11/00Z",
"11/12Z", "12/00Z", "12/12Z", "13/00Z", "13/12Z", "14/00Z", "14/12Z",
"15/00Z", "15/12Z", "16/00Z", "16/12Z", "17/00Z", "17/12Z", "18/00Z",
"18/12Z", "19/00Z", "19/12Z", "20/00Z", "20/12Z", "21/00Z", "21/12Z",
"22/00Z", "22/12Z", "23/00Z", "23/12Z", "24/00Z", "24/12Z", "25/00Z",
"25/12Z", "26/00Z", "26/12Z", "27/00Z", "27/12Z", "28/00Z", "28/12Z",
"29/00Z", "29/12Z", "30/00Z", "30/12Z", "31/00Z", "31/12Z"
) -> hr_inputs
hr_trans <- stats::setNames(hr_vals, hr_inputs)
o_from_hr <- from_hr <- as.character(tolower(from_hr))
o_to_hr <- to_hr <- as.character(tolower(to_hr))
if ((from_hr == "all") || (to_hr == "all")) {
from_hr <- to_hr <- "all"
} else {
from_hr <- hr_trans[sprintf("%s/%02dZ", format(date, "%d"), as.integer(from_hr))]
match.arg(from_hr, hr_vals)
to_hr <- hr_trans[sprintf("%s/%02dZ", format(date, "%d"), as.integer(to_hr))]
match.arg(to_hr, hr_vals)
}
# clean up the station number if it was entered as a double
station_number <- as.character(as.integer(station_number))
# execute the API call
httr::GET(
url = "http://weather.uwyo.edu/cgi-bin/sounding",
query = list(
region = region,
TYPE = "TEXT:LIST",
YEAR = year,
MONTH = sprintf("%02d", as.integer(month)),
FROM = from_hr,
TO = to_hr,
STNM = station_number
)
) -> res
# check for super bad errors (that we can't handle nicely)
httr::stop_for_status(res)
# get the page content
doc <- httr::content(res, as="text")
# if the site reports no data, issue a warning and return an empty data frame
if (grepl("Can't get", doc)) {
doc <- xml2::read_html(doc)
msg <- rvest::html_nodes(doc, "body")
msg <- rvest::html_text(msg, trim=TRUE)
msg <- gsub("\n\n+.*$", "", msg)
warning(msg)
return(data.frame(stringsAsFactors=FALSE))
}
# turn it into something we can parse
doc <- xml2::read_html(doc)
# get the metadata
meta <- rvest::html_node(doc, "h2")
meta <- rvest::html_text(meta, trim=TRUE)
# get the table
doc <- rvest::html_nodes(doc, "pre")[[1]]
doc <- rvest::html_text(doc, trim=TRUE)
doc <- strsplit(doc, "\n")[[1]]
# extract the column names and make them really nice and informative
col_names <- doc[2:3]
gsub(
"_+", "_",
gsub(
"[[:punct:]]", "_",
gsub(
"%", "pct", tolower(
sprintf(
"%s_%s",
unlist((strsplit(trimws(col_names[1]), "[[:space:]]+"))),
unlist((strsplit(trimws(col_names[2]), "[[:space:]]+")))
)
)
)
)
) -> col_names
# parse the values correctly (this is better than read.table)
con <- textConnection(doc[-c(1:4)])
read.fwf(
file = con,
widths = rep(7, 11),
header = FALSE,
colClasses = rep("character", 11), # we'll convert them ourselves, tyvm
stringsAsFactors=FALSE
) -> xdf
# get rid of white space in each column
xdf[] <- lapply(xdf, trimws)
# turn them all numeric
xdf[] <- suppressWarnings(lapply(xdf, as.numeric))
# set our column names
colnames(xdf) <- col_names
# add the date and from/to hr as columns
xdf$date <- date
xdf$from_hr <- o_from_hr
xdf$to_hr <- o_to_hr
# this affords pretty-printing if you use the tidyverse
class(xdf) <- c("tbl_df", "tbl", "data.frame")
# add the metadata as an unobtrusive attribute
attr(xdf, "meta") <- meta
xdf
}
以上内容已被广泛评论,因此我将不做进一步解释。
现在我们可以进行迭代:
# get the start/end range
startDate <- as.Date("01-11-17", format="%d-%m-%y")
endDate <- as.Date("31-01-18",format="%d-%m-%y")
# make a sequence
days <- seq(startDate, endDate, "1 day")
# apply the sequence — note that I am not going to hit the server >80x for
# an example and *you* should add a Sys.sleep(5) before the call to
# get_sounding_data() to be kind to their servers.
lapply(days[1:4], function(day) {
get_sounding_data(
region = "seasia",
date = day,
from_hr = "00",
to_hr = "00",
station_number = "48657"
)
}) -> soundings_48657
如果某个站点在特定日期没有数据,则会有关于它的警告,因此您可以执行此操作以检查由于没有数据而缺少的天数。
warnings()
## Warning message:
## In get_sounding_data(region = "seasia", date = day, from_hr = "00", :
## Can't get 48657 WMKD Kuantan Observations at 00Z 01 Nov 2017.
这就是我们拥有的,请注意,第一个元素为空白b / c,该日没有数据:
str(soundings_48657, 2)
## List of 4
## $ :'data.frame': 0 obs. of 0 variables
## $ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 4 obs. of 14 variables:
## ..$ pres_hpa : num [1:4] 1006 1000 993 981
## ..$ hght_m : num [1:4] 16 70 132 238
## ..$ temp_c : num [1:4] 24 23.6 23.2 24.6
## ..$ dwpt_c : num [1:4] 23.4 22.4 21.5 21.6
## ..$ relh_pct : num [1:4] 96 93 90 83
## ..$ mixr_g_kg: num [1:4] 18.4 17.4 16.6 16.9
## ..$ drct_deg : num [1:4] 0 0 NA NA
## ..$ sknt_knot: num [1:4] 0 0 NA NA
## ..$ thta_k : num [1:4] 297 297 297 299
## ..$ thte_k : num [1:4] 350 347 345 349
## ..$ thtv_k : num [1:4] 300 300 300 302
## ..$ date : Date[1:4], format: "2017-11-02" "2017-11-02" "2017-11-02" "2017-11-02"
## ..$ from_hr : chr [1:4] "00" "00" "00" "00"
## ..$ to_hr : chr [1:4] "00" "00" "00" "00"
## ..- attr(*, "meta")= chr "48657 WMKD Kuantan Observations at 00Z 02 Nov 2017"
## $ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 83 obs. of 14 variables:
## ..$ pres_hpa : num [1:83] 1005 1000 991 983 973 ...
## ..$ hght_m : num [1:83] 16 62 141 213 302 329 466 595 745 802 ...
## ..$ temp_c : num [1:83] 24.2 24.2 24 23.8 23.3 23.2 23.4 22.7 21.8 21.4 ...
## ..$ dwpt_c : num [1:83] 23.6 23.1 22.9 22.7 22 21.8 22 21.3 20.4 19.7 ...
## ..$ relh_pct : num [1:83] 96 94 94 94 92 92 92 92 92 90 ...
## ..$ mixr_g_kg: num [1:83] 18.6 18.2 18.1 18 17.4 ...
## ..$ drct_deg : num [1:83] 190 210 212 213 215 215 212 210 210 210 ...
## ..$ sknt_knot: num [1:83] 1 3 6 8 11 11 14 16 15 15 ...
## ..$ thta_k : num [1:83] 297 297 298 298 299 ...
## ..$ thte_k : num [1:83] 351 350 351 351 350 ...
## ..$ thtv_k : num [1:83] 300 301 301 302 302 ...
## ..$ date : Date[1:83], format: "2017-11-03" "2017-11-03" "2017-11-03" "2017-11-03" ...
## ..$ from_hr : chr [1:83] "00" "00" "00" "00" ...
## ..$ to_hr : chr [1:83] "00" "00" "00" "00" ...
## ..- attr(*, "meta")= chr "48657 WMKD Kuantan Observations at 00Z 03 Nov 2017"
## $ :Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 89 obs. of 14 variables:
## ..$ pres_hpa : num [1:89] 1005 1001 1000 993 987 ...
## ..$ hght_m : num [1:89] 16 54 63 125 178 304 322 597 747 898 ...
## ..$ temp_c : num [1:89] 24.8 24.4 24.4 23.6 25 24.6 24.6 22.9 22 20.9 ...
## ..$ dwpt_c : num [1:89] 24.2 23.4 23.2 21.8 22.6 21.4 21.2 20.4 19.9 19.1 ...
## ..$ relh_pct : num [1:89] 96 94 93 90 87 82 81 86 88 89 ...
## ..$ mixr_g_kg: num [1:89] 19.4 18.5 18.3 16.9 17.9 ...
## ..$ drct_deg : num [1:89] 0 264 240 237 235 230 229 210 205 200 ...
## ..$ sknt_knot: num [1:89] 0 2 2 4 5 9 10 19 20 19 ...
## ..$ thta_k : num [1:89] 298 298 298 297 299 ...
## ..$ thte_k : num [1:89] 354 351 351 346 352 ...
## ..$ thtv_k : num [1:89] 301 301 301 300 302 ...
## ..$ date : Date[1:89], format: "2017-11-04" "2017-11-04" "2017-11-04" "2017-11-04" ...
## ..$ from_hr : chr [1:89] "00" "00" "00" "00" ...
## ..$ to_hr : chr [1:89] "00" "00" "00" "00" ...
## ..- attr(*, "meta")= chr "48657 WMKD Kuantan Observations at 00Z 04 Nov 2017"
现在,我们有了一个数据框列表(有些是空的),我们需要将它们变成一个大而整洁的数据框。我们将删除空的并将它们压缩在一起:
length_not_zero <- function(x) length(x) > 0
Reduce(
rbind.data.frame,
Filter(length_not_zero, soundings_48657)
)
## # A tibble: 176 x 14
## pres_hpa hght_m temp_c dwpt_c relh_pct mixr_g_kg drct_deg sknt_knot thta_k thte_k thtv_k
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1006 16 24 23.4 96 18.4 0 0 297. 350. 300.
## 2 1000 70 23.6 22.4 93 17.4 0 0 297. 347. 300.
## 3 993 132 23.2 21.5 90 16.6 NA NA 297. 345. 300.
## 4 981 238 24.6 21.6 83 16.9 NA NA 299. 349. 302.
## 5 1005 16 24.2 23.6 96 18.6 190 1 297. 351 300.
## 6 1000 62 24.2 23.1 94 18.2 210 3 297. 350. 301.
## 7 991 141 24 22.9 94 18.1 212 6 298. 351. 301.
## 8 983 213 23.8 22.7 94 18.0 213 8 298. 351. 302.
## 9 973 302 23.3 22 92 17.4 215 11 299. 350. 302.
## 10 970 329 23.2 21.8 92 17.3 215 11 299. 350. 302
## # ... with 166 more rows, and 3 more variables: date <date>, from_hr <chr>, to_hr <chr>
请注意,^^中有日期/从/到(请看底行),因此您可以通过整洁的分析来发疯。
我还没有使用all
/ from_hr
的{{1}}参数对它进行测试,因此可能会破坏它。如果是这样,请执行ping操作,然后我会做什么。
答案 1 :(得分:2)
我已经将该函数封装到non-CRAN package中。您可以:
devtools::install_git("https://gitlab.com/hrbrmstr/unsound.git")
然后:
library(unsound)
library(magick)
library(tidyverse)
startDate <- as.Date("01-11-17", format="%d-%m-%y")
endDate <- as.Date("31-01-18",format="%d-%m-%y")
# make a sequence
days <- seq(startDate, endDate, "1 day")
# apply the sequence — note that I am not going to hit the server >80x for
# an example and *you* should add a Sys.sleep(5) before the call to
# get_sounding_data() to be kind to their servers.
lapply(days[1:4], function(day) {
get_sounding_data(
region = "seasia",
date = day,
from_hr = "00",
to_hr = "00",
station_number = "48657"
)
}) -> soundings_48657
## Warning message:
## In get_sounding_data(region = "seasia", date = day, from_hr = "00", :
## Can't get 48657 WMKD Kuantan Observations at 00Z 01 Nov 2017.
rbind_soundings(soundings_48657)
## # A tibble: 176 x 14
## pres_hpa hght_m temp_c dwpt_c relh_pct mixr_g_kg drct_deg sknt_knot
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1006. 16. 24.0 23.4 96. 18.4 0. 0.
## 2 1000. 70. 23.6 22.4 93. 17.4 0. 0.
## 3 993. 132. 23.2 21.5 90. 16.6 NA NA
## 4 981. 238. 24.6 21.6 83. 16.9 NA NA
## 5 1005. 16. 24.2 23.6 96. 18.6 190. 1.
## 6 1000. 62. 24.2 23.1 94. 18.2 210. 3.
## 7 991. 141. 24.0 22.9 94. 18.1 212. 6.
## 8 983. 213. 23.8 22.7 94. 18.0 213. 8.
## 9 973. 302. 23.3 22.0 92. 17.4 215. 11.
## 10 970. 329. 23.2 21.8 92. 17.3 215. 11.
## # ... with 166 more rows, and 6 more variables: thta_k <dbl>,
## # thte_k <dbl>, thtv_k <dbl>, date <date>, from_hr <chr>, to_hr <chr>
我还添加了一个函数来检索预先生成的地图:
get_sounding_map(
station_number = "48657",
date = Sys.Date()-1,
map_type = "skewt",
map_format = "gif",
region = "seasia",
from_hr = "00",
to_hr = "00"
)