Question

我想在R中使用函数optim()来最小化目标函数。两个优化参数都有约束。

我创建了一个测试样本数据。流是由NA分隔的随机序列数据。函数NAins()可以在这个问题的最后看到。

    flow = c(rep(NA,10),NAins(as.data.frame(runif(5000)), .1)$runif)
    rain = runif (length(flow))
    event = with(rle(!is.na(flow )),cbind(length=lengths[values],position=cumsum(c(1,lengths))[values]));

此功能用于计算r2。

    test_function = function(ndays, event, flow, rain,upboundary){
      flowvolume = rainvolume = raininweek = raininmonth =NULL;
      for (i in 1:(length(event)/2)){
        if (upboundary < event[,'position'][i]){
          flowvolume[i] = sum(flow[(event[,'position'][i]):(event[,'position'][i]+event[,'length'][i]-1)], na.rm = TRUE) # total flow during the non NA period
          rainvolume[i] = sum(rain[(event[,'position'][i]):(event[,'position'][i]+event[,'length'][i]-1)], na.rm = TRUE) # total rain during the non NA period
          raininweek[i] = sum(rain[(event[,'position'][i]-ndays[1]):(event[,'position'][i]-1)], na.rm = TRUE) #total rain imediate before NA with a constrained period of nday[1]
          raininmonth[i] = sum(rain[(event[,'position'][i]-ndays[2]-ndays[1]):(event[,'position'][i]-ndays[1]-1)], na.rm = TRUE) #total rain iprior to nday[1] 
        } else {next}
      }
      -summary(lm(flowvolume ~ rainvolume + raininweek + raininmonth))$r.squared # to minimise R2
    }

这是带约束的优化。

    results= optim(par=c(2,20), lower=c(1,10), upper=c(10,30),method="L-BFGS-B",test_function, event=event, rain=rain, flow=flow,upboundary=30)

在此模拟中，结果总是收敛到凝视位置。如果optim()在这个问题上不是一个好的选择，你能推荐一些其他的包或功能吗？

这是用于创建具有随机NA的样本流数据的函数。

    ################################################################
    # RANDOMLY INSERT A CERTAIN PROPORTION OF NAs INTO A DATAFRAME #
    ################################################################
    NAins <-  NAinsert <- function(df, prop){
      n <- nrow(df)
      m <- ncol(df)
      num.to.na <- ceiling(prop*n*m)
      id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
      rows <- id %/% m + 1
      cols <- id %% m + 1
      sapply(seq(num.to.na), function(x){
        df[rows[x], cols[x]] <<- NA
      }
      )
      return(df)
    }

Answer 1

似乎优化永远不会远离起点，因为这些参数是隐式整数。但optim不知道这一点。它只是看到一个平坦的梯度。

如果您的ndays参数空间很小，正如您在问题中指出的那样，请尝试枚举所有这些组合。这是一个方便的功能。帽子提示How to optimize for integer parameters (and other discontinuous parameter space) in R?。

library(NMOF)
grid<- gridSearch(test_function, list(ndays1=seq(1,10), ndays2=seq(10,22)),
                  event=event, rain=rain, flow=flow, upboundary=30)

grid$minfun
grid$minlevels

注意我必须切断ndays [2]的部分搜索空间，因为它导致了负下标错误。您需要在函数中添加一些检查以测试负下标。

Answer 2

我认为枚举是最好的选择，特别是如果你有很少的变量和一个非常非线性的函数。 Nelder Mead或Hooke Jeeves一定会为您提供本地解决方案。这里的函数看起来非常非线性，在某些区域非常平坦。

你可以使用来自Revolution Analytics的foreach和doParallel等并行软件包获得一些加速。在下面的例子中，我做了穷举搜索的纯粹并行实现。我已经修改了test_function以返回x变量。

test_function2 = function(ndays, event, flow, rain,upboundary){
  flowvolume = rainvolume = raininweek = raininmonth =NULL;
  for (i in 1:(length(event)/2)){
    if (upboundary < event[,'position'][i]){
      flowvolume[i] = sum(flow[(event[,'position'][i]):(event[,'position'][i]+event[,'length'][i]-1)], na.rm = TRUE) # total flow during the non NA period
      rainvolume[i] = sum(rain[(event[,'position'][i]):(event[,'position'][i]+event[,'length'][i]-1)], na.rm = TRUE) # total rain during the non NA period
      raininweek[i] = sum(rain[(event[,'position'][i]-ndays[1]):(event[,'position'][i]-1)], na.rm = TRUE) #total rain imediate before NA with a constrained period of nday[1]
      raininmonth[i] = sum(rain[(event[,'position'][i]-ndays[2]-ndays[1]):(event[,'position'][i]-ndays[1]-1)], na.rm = TRUE) #total rain iprior to nday[1] 
    } else {next}
  }
  rsq=-summary(lm(flowvolume ~ rainvolume + raininweek + raininmonth))$r.squared # to minimise R2
  return(c(ndays,rsq))
}

x1<-c(1:10)
x2<-c(10:30)
a<-expand.grid(x1,x2)

library(foreach)
library(doParallel)

cl <- makePSOCKcluster(4)
registerDoParallel(cl)

mymin <-function(z1,z2) {
  if (z1[[3]]<=z2[[3]]) {
    return(z1)
  } else {
    return(z2)
  }
}

ptm<-proc.time()
#c<-matrix(foreach(i=1:210) %dopar% test_function(as.numeric(a[i,]),event,flow,rain,30),10)
c<-foreach(i=1:210,.combine=mymin) %dopar% test_function2(as.numeric(a[i,]),event,flow,rain,30)
proc.time()-ptm

stopCluster(cl)

此时的运行时间约为4.6秒

> ptm<-proc.time()
> #c<-matrix(foreach(i=1:210) %dopar% test_function(as.numeric(a[i,]),event,flow,rain,30),10)
  > c<-foreach(i=1:210,.combine=mymin) %dopar% test_function2(as.numeric(a[i,]),event,flow,rain,30)
> proc.time()-ptm
user  system elapsed 
0.211   0.030   4.596 
> c
[1]  1.0000000 11.0000000 -0.9363349

对于NMOF的实施，它是11s

> ptm<-proc.time()
> grid<- gridSearch(test_function, list(ndays1=seq(1,10), ndays2=seq(10,30)),
                    +                   event=event, rain=rain, flow=flow, upboundary=30)
2 variables with 10, 21 levels: 210 function evaluations required.
> proc.time()-ptm
user  system elapsed 
10.963   0.004  10.974
> grid$minfun
[1] -0.9363349
> grid$minlevels
[1]  1 11

我希望这会有所帮助。如果您打算采用这种方法，请参阅foreach的文档。

加速的另一个选择是使用更快的方法来解决lm，这样你就可以减少单个函数调用评估时间。我在下面的链接中看到了一些选项：

How to compute minimal but fast linear regressions on each column of a response matrix?

用于多变量的optim（）返回R中起始位置的结果

2 个答案: