如何优化这些循环和功能

时间:2015-11-14 18:14:40

标签: r optimization

问题

我正在构建一些天气数据,需要检查并确保没有异常值,值等于-9999,并且没有丢失天数。如果找到这些条件中的任何一个,我写了一个函数nearest(),它将找到5个最近的站并计算反距离加权值,然后将其插回到找到条件的位置。问题是代码有效,但运行需要很长时间。我有600多个电台,每个电台大约需要1个小时来计算。

问题

可以优化此代码以缩短计算时间吗?处理以这种方式使用的嵌套for()循环的最佳方法是什么?

代码

以下代码是用作可重现示例的数据集的一小部分。这显然运行得非常快,但是当分布在整个数据集上时需要很长时间。请注意,在输出中,第10行的值中包含NA。运行代码时,将替换该值。

dput:

db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4, 
    element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id", 
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")

output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX", 
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900, 
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1, 
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 
1), date = structure(c(-25567, -25567, -25536, -25536, -25508, 
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02, 
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id", 
"element", "year", "month", "day", "date", "value"), row.names = c(NA, 
-10L), class = c("tbl_df", "data.frame"))

newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632", 
"USC00031632", "USC00031632"), element = structure(c(1L, 2L, 
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"), 
    year = c("1900", "1900", "1900", "1900", "1900", "1900", 
    "1900", "1900", "1900", "1900"), month = c("01", "01", "02", 
    "02", "03", "04", "04", "05", "05", "01"), day = c("01", 
    "01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567, 
    -25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447, 
    -25566), class = "Date"), value = c(30.02, 10.94, 37.94, 
    10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id", 
"element", "year", "month", "day", "date", "value"), row.names = c(NA, 
10L), class = "data.frame")

stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466", 
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481), 
    y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06, 
    44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df", 
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))

station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858, 
    value = 30.02), row.names = c(NA, -1L), class = c("tbl_df", 
"data.frame"), .Names = c("id", "lat", "long", "value"))

nearest()功能:

nearest <- function(id, yr, mnt, dy, ele, out, stack, station){

  if (dim(stack)[1] >= 1){
    ifelse(dim(stack)[1] == 1, v <- stack$value, v <- idw(stack$value, stack[,2:4], station[,2:3])) 
  } else {
    ret <- filter(out, id == s_id & year == yr, month == mnt, element == ele, value != -9999)
    v <- mean(ret$value) 
  } 
  return(v)
}

for()循环:

library(dplyr)
library(phylin)
library(lubridate)

for (i in unique(db_sid$id)){

  # Check for outliers
  for(j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
    output[j,7] <- nearest(id = j, yr = as.numeric(output[j,3]), mnt = as.numeric(output[j,4]), dy = as.numeric(output[j,5]),
                           ele = as.character(output[j,2]), out = output)
  }

  # Check for NA and replace
  for (k in which(is.na(newdat$value))){
   newdat[k,7] <- nearest(id = k, yr = as.numeric(newdat[k,3]), mnt = as.numeric(newdat[k,4]), dy = as.numeric(newdat[k,5]),
                           ele = as.character(newdat[k,2]), out = newdat, stack = stack, station = station)
  }

}

1 个答案:

答案 0 :(得分:1)

我不确定我完全理解你正在尝试做什么。例如,外部for循环中的i从未实际使用过。以下是一些我认为对您有用的代码:

library(plyr)
library(dplyr)

output_summary = 
  output %>%
  filter(value %>% between(-80, 134) ) %>%
  group_by(date, element, id) %>%
  summarize(mean_value = mean(value))

if (nrow(stack) == 1) fill_value = stack$value else
  fill_value = idw(
    stack$value,
    stack %>% select(x, y, value),
    station %>% select(lat, long) )

newdat_filled = 
  newdat %>%
  mutate(filled_value = 
           value %>% 
           mapvalues(NA, fill_value) )