我正在构建一些天气数据,需要检查并确保没有异常值,值等于-9999,并且没有丢失天数。如果找到这些条件中的任何一个,我写了一个函数nearest()
,它将找到5个最近的站并计算反距离加权值,然后将其插回到找到条件的位置。问题是代码有效,但运行需要很长时间。我有600多个电台,每个电台大约需要1个小时来计算。
可以优化此代码以缩短计算时间吗?处理以这种方式使用的嵌套for()
循环的最佳方法是什么?
以下代码是用作可重现示例的数据集的一小部分。这显然运行得非常快,但是当分布在整个数据集上时需要很长时间。请注意,在输出中,第10行的值中包含NA。运行代码时,将替换该值。
dput:
db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4,
element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id",
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")
output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900,
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1), date = structure(c(-25567, -25567, -25536, -25536, -25508,
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02,
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
-10L), class = c("tbl_df", "data.frame"))
newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = structure(c(1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"),
year = c("1900", "1900", "1900", "1900", "1900", "1900",
"1900", "1900", "1900", "1900"), month = c("01", "01", "02",
"02", "03", "04", "04", "05", "05", "01"), day = c("01",
"01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567,
-25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447,
-25566), class = "Date"), value = c(30.02, 10.94, 37.94,
10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
10L), class = "data.frame")
stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466",
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481),
y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06,
44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))
station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858,
value = 30.02), row.names = c(NA, -1L), class = c("tbl_df",
"data.frame"), .Names = c("id", "lat", "long", "value"))
nearest()
功能:
nearest <- function(id, yr, mnt, dy, ele, out, stack, station){
if (dim(stack)[1] >= 1){
ifelse(dim(stack)[1] == 1, v <- stack$value, v <- idw(stack$value, stack[,2:4], station[,2:3]))
} else {
ret <- filter(out, id == s_id & year == yr, month == mnt, element == ele, value != -9999)
v <- mean(ret$value)
}
return(v)
}
for()
循环:
library(dplyr)
library(phylin)
library(lubridate)
for (i in unique(db_sid$id)){
# Check for outliers
for(j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
output[j,7] <- nearest(id = j, yr = as.numeric(output[j,3]), mnt = as.numeric(output[j,4]), dy = as.numeric(output[j,5]),
ele = as.character(output[j,2]), out = output)
}
# Check for NA and replace
for (k in which(is.na(newdat$value))){
newdat[k,7] <- nearest(id = k, yr = as.numeric(newdat[k,3]), mnt = as.numeric(newdat[k,4]), dy = as.numeric(newdat[k,5]),
ele = as.character(newdat[k,2]), out = newdat, stack = stack, station = station)
}
}
答案 0 :(得分:1)
我不确定我完全理解你正在尝试做什么。例如,外部for循环中的i从未实际使用过。以下是一些我认为对您有用的代码:
library(plyr)
library(dplyr)
output_summary =
output %>%
filter(value %>% between(-80, 134) ) %>%
group_by(date, element, id) %>%
summarize(mean_value = mean(value))
if (nrow(stack) == 1) fill_value = stack$value else
fill_value = idw(
stack$value,
stack %>% select(x, y, value),
station %>% select(lat, long) )
newdat_filled =
newdat %>%
mutate(filled_value =
value %>%
mapvalues(NA, fill_value) )