目标:获取特定潮汐站全年的潮汐预测数据(参见下面的示例)。
尝试:我参考了来自各个帖子的提示,包括 this exchange 中关于天气数据的提示,它与我的需求最为相似。我注意到我想要的数据所在的网站是一个 cgi;当我选择参数时,这些参数不会反映在链接地址中。我完全不熟悉数据抓取方面的问题。
library(RCurl)
# Tide-prediction page for one station; parameters are baked into the query string.
url <- "http://tbone.biol.sc.edu/tide/tideshow.cgi?site=South+Beach%2C+Yaquina+Bay%2C+Oregon&units=f"
# Fetch the raw HTML of the page as one string.
s <- getURL(url)
# BUG FIX: gsub() takes (pattern, replacement, x). The original call
# gsub("<br>\n", s) passed the page text as the *replacement* and omitted x,
# which errors. Strip the "<br>\n" line breaks by replacing them with "".
s <- gsub("<br>\n", "", s)
# Parse the cleaned text as CSV via a text connection, then close the
# connection so it does not leak.
dat <- read.csv(con <- textConnection(s))
close(con)
这是第一个实际给我一个产品的代码,但它不是表中的数据。理想情况下,我想选择选项(例如1年,将开始日期设置为1月1日)。我从来没有这样做过,也不了解HTML编程或开发以了解使用此类网站的工具。
答案 0(得分:0)
在同事的帮助下,以下代码可以按若干条件,从基于 GUI 的 .cgi 站点为多个潮汐站抓取数据。
我必须回到主网站上几步,列出多个网站(超链接),选择我想要的网站,应用GUI选择的标准,然后将其适当地格式化为数据框。
library(rvest)
library(plyr)
library(dplyr)
library(stringr)
# Base url for the region (i.e. the page listing multiple station links).
url <- "http://tbone.biol.sc.edu/tide/sites_uswest.html"
# Read the HTML and extract the target of every hyperlink on the page;
# open `url` in a browser to see the list of station links.
l <- url %>%
  read_html() %>%
  html_nodes("a") %>%
  html_attr("href")
# Keep only links to tideshow.cgi (the per-station pages).
# FIX: the original pattern "*tideshow*" is shell-glob syntax, not a valid
# regex (a leading "*" has no preceding atom). A fixed-string match expresses
# the intended substring filter unambiguously.
sites <- l[grep("tideshow", l, fixed = TRUE)]
# Drop everything up to and including 'site=' so each entry is just the
# URL-encoded station name, ready to paste into the cgi query.
sites <- gsub(".*site=", "", sites)
# Select the stations of interest. The vector can be built however you
# prefer; here "|" ("or") selects several stations at once, and the
# negative lookahead (?!.*\%282\%29) drops duplicate "(2)" entries.
sites <- sites[grep("(Waldport\\%2C\\+Alsea|South\\+Beach\\%2C\\+Yaquina|Charleston\\%2C\\+Oregon)(?!.*\\%282\\%29)", sites, perl=TRUE)]
# Starting date of the prediction window (strings, as the cgi expects).
year <- "2016"
month <- "01"
day <- "01"
# Length of the prediction window in days.
# FIX: use <- (not =) for top-level assignment, per R convention.
numberofdays <- 366 + 365 # days in 2016 (leap year) + days in 2017
# lapply through the site vector, x represents site.
# This will pull data from each site in the vector "sites", and bind it together in a list
# Pull the tide table for every station in `sites`; x is one URL-encoded
# station name. Each iteration returns a character matrix with columns
# (site, coord, data); lapply collects these matrices in a list.
o <- lapply(sites, function(x){
  # Build the cgi query: table output, 1-minute interval, `numberofdays`
  # days of predictions starting at year/month/day, for station `x`.
  path <- paste0("http://tbone.biol.sc.edu/tide/tideshow.cgi?type=table;tplotdir=horiz;gx=640;gy=240;caltype=ndp;interval=00%3A01;glen=",
                 numberofdays ,
                 ";fontsize=%2B0;units=feet;",
                 "year=", year, ";month=", month, ";day=", day,
                 ";hour=00;min=01;",
                 "killsun=1;tzone=local;ampm24=24;colortext=black;colordatum=white;colormsl=yellow;colortics=red;colorday=skyblue;colornight=deep-%3Cbr%20%2F%3Eskyblue;colorebb=seagreen;colorflood=blue;site=",
                 x,
                 ";d_year=;d_month=Jan;d_day=01;d_hour=00;d_min=00"
  )
  # Read the whole response as a character vector, one line per element.
  d <- readLines(path, warn = FALSE)
  # Hoist the first <h2> line: both the station name and the coordinates
  # are extracted from it (avoids grepping the page twice).
  header <- d[grep("<h2>", d)][1]
  # Station name sits between the <h2>...</h2> tags.
  site <- str_extract(string = header, pattern = "(?<=<h2>)(.*?)(?=</h2>)")
  # Coordinates appear after a <pre> tag on that same line —
  # NOTE(review): assumes the page keeps <h2> and <pre> on one line; confirm.
  coord <- gsub(".*<pre>", "", header)
  # Tide rows are the lines containing a YYYY-M-D date.
  # Keep the name `data`: downstream code splits the column df$data.
  data <- d[grep("\\d{4}[-]\\d{1,2}[-]\\d{1,2}", d) ]
  # Recycle site/coord across all data rows and return the matrix.
  cbind(site, coord, data)
})
# Stack the per-station matrices from the list into a single data frame.
df <- ldply(o, rbind.data.frame)
# Break each raw tide line into its 6 whitespace-delimited fields.
split_cols <- str_split_fixed(df$data, "\\s+", 6)
# Attach the split fields to the site/coordinate columns and label them.
tides <- cbind(df[c(1, 2)], split_cols)
colnames(tides) <- c("site", "coordinates", "date", "time", "tz", "depth", "units", "tide")
# Quick sanity checks on the assembled table.
head(tides)
str(tides)
summary(tides)