I am using the optimx package in R to solve an optimization problem. The data set is very large and stored in Hadoop, so I am trying to solve the problem with Hive streaming and R.
Method 1: Use Hive streaming with the R code as the transform script.
Method 2: Run the same code in an R console on the same server.

For example, here is a sample of the data set used in both cases:

  ITM_NBR FAM_ID OLD_PRICE MIN_COMP MAX_COMP ELAST Units PTC_FAM_ID PTC_ITM_NBR
1   64430 101006     59.98   50.983   68.977     1   230  AB_101006     AB64430
2   16961 101006     59.98   50.983   68.977     1   151  AB_101006     AB16961
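For experimenting outside Hadoop, the two sample rows can be rebuilt as an in-memory data frame. This is only a sketch: the MIN_E and MAX_E columns are not shown in the sample above, so the values below are assumptions (MAX_E is taken from the ub printed in the results further down).

## Sketch only: rebuild the sample rows in memory (MIN_E/MAX_E are assumed values)
e <- data.frame(
  ITM_NBR     = c(64430, 16961),
  FAM_ID      = c(101006, 101006),
  OLD_PRICE   = c(59.98, 59.98),
  MIN_COMP    = c(50.983, 50.983),
  MAX_COMP    = c(68.977, 68.977),
  ELAST       = c(1, 1),
  Units       = c(230, 151),
  PTC_FAM_ID  = c("AB_101006", "AB_101006"),
  PTC_ITM_NBR = c("AB64430", "AB16961"),
  MIN_E       = c(50.983, 50.983),  ## assumed, not shown in the sample
  MAX_E       = c(62.979, 62.979),  ## assumed, taken from the printed ub
  stringsAsFactors = FALSE
)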
Below is the R code I use to read the data from the Hive table and from the CSV file and to compute the optimal value. The only difference between the two methods is how the data is read in.

## Library required for the program
library(optimx)
## Variables required for this program
lb <- numeric(5)
ub <- numeric(5)
exprvec <- expression()
o <- el <- u <- numeric(2)
#**Method 1**
f <- file("stdin")
open(f)
e <- read.table(f)
#**Method 2**
e <- read.csv('Dataset1.csv')
names(e) <- c("ITM_NBR","FAM_ID","OLD_PRICE","MIN_COMP","MAX_COMP","ELAST","Units","PTC_FAM_ID","PTC_ITM_NBR","MIN_E","MAX_E")
lb <- max(e$MIN_COMP, e$MIN_E)
ub <- min(e$MAX_COMP, e$MAX_E)
x0 <- mean(c(lb, ub), na.rm = TRUE)
for(j in 1:nrow(e)){
  o[j]  <- e$OLD_PRICE[j]
  el[j] <- e$ELAST[j]
  u[j]  <- e$Units[j]
  ## Revenue term for item j: price x scaled by a linear demand response and unit volume
  exprvec[[j]] <- substitute(x*(1-(x-o)*el/o)*u, list(o=o[j], el=el[j], u=u[j]))
}
eval_obj_f <- function(x){
  ## Evaluate each per-item expression at price x and sum across items
  sum(sapply(exprvec, eval, envir=list(x=x)))
}
## Required options for the optimx package and algorithms
res <- optimx(par = x0, fn = eval_obj_f, gr = NULL, lower = lb, upper = ub, method = "L-BFGS-B", control = list(maximize = TRUE))
print(res)
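As a side note, the expression vector built in the loop is just the total revenue, i.e. the sum over items of x*(1-(x-OLD_PRICE)*ELAST/OLD_PRICE)*Units evaluated at a single price x. A vectorized objective computed directly from the columns of e should give the same value without substitute()/eval(); this is a sketch for cross-checking, not part of the original script:

## Sketch: equivalent vectorized objective computed straight from the data frame
eval_obj_vec <- function(x, dat = e){
  sum(x * (1 - (x - dat$OLD_PRICE) * dat$ELAST / dat$OLD_PRICE) * dat$Units)
}
## For any price in [lb, ub] this should match eval_obj_f, e.g.
## eval_obj_f(x0); eval_obj_vec(x0)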
#**Method 1 Invocation**
hive> from (select ITM_NBR,FAM_ID,OLD_PRICE,MIN_COMP,MAX_COMP,ELAST,UNITS,PTC_FAM_ID,PTC_ITM_NBR,MIN_E,MAX_E from rtable distribute by PTC_FAM_ID) t1 insert overwrite directory '/user/rapp/t1' reduce ITM_NBR,FAM_ID,OLD_PRICE,MIN_COMP,MAX_COMP,ELAST,UNITS,PTC_FAM_ID,PTC_ITM_NBR,MIN_E,MAX_E using 'rscript.r';
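One detail that matters for Method 1: Hive's reduce/transform streams the selected columns to the script as tab-separated text with no header (NULLs arrive as \N, and the script normally has to be registered with ADD FILE first). Because of that, the defaults of read.table() on stdin may parse the rows differently than read.csv() parses the local file. A hedged sketch of a more explicit read for the streaming script, assuming the column order matches the SELECT list in the query above:

## Sketch for the streaming script: parse stdin with explicit separator, types and NULL markers
## (column names/order assumed to match the SELECT list in the Hive query)
con <- file("stdin")
open(con)
e <- read.table(con, sep = "\t", header = FALSE, stringsAsFactors = FALSE,
                na.strings = c("\\N", "NULL"),
                col.names = c("ITM_NBR","FAM_ID","OLD_PRICE","MIN_COMP","MAX_COMP",
                              "ELAST","Units","PTC_FAM_ID","PTC_ITM_NBR","MIN_E","MAX_E"))
close(con)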
#**Method 2 Invocation**
[m1.hdp22] R CMD BATCH optim.r
#**Method 1 results**
lb= 50.983
ub= 62.979
x0= 56.981
p1 value fevals gevals niter convcode kkt1 kkt2 xtimes
L-BFGS-B 59.98 2.285238e+04 4 4 NA 0 TRUE TRUE 0.001
#**Method 2 results**
lb= 50.983
ub= 62.979
x0= 56.981
p1 value fevals gevals niter convcode kkt1 kkt2 xtimes
L-BFGS-B 53.05923 23247.9 4 4 NA 0 TRUE TRUE 0.001
The optimizer settings are identical in both runs, and the only difference between the two methods is how the data is read, yet Method 1 (Hive streaming) returns an optimal price of 59.98 while Method 2 (CSV file) returns 53.05923. Why do the two methods give different results?
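To narrow the discrepancy down, one simple check (not from the original post) is to dump the data frame that each path actually parsed and compare the two dumps; the file paths below are placeholders, assuming the dump from the streaming run can be copied to somewhere both files are visible.

## Sketch: persist what each method parsed, then compare
saveRDS(e, "/tmp/e_method1.rds")   ## add this line to the streaming script
saveRDS(e, "/tmp/e_method2.rds")   ## add this line to the console script
m1 <- readRDS("/tmp/e_method1.rds")
m2 <- readRDS("/tmp/e_method2.rds")
str(m1); str(m2)                   ## column types should match
all.equal(m1, m2)                  ## reports any differences in values or rows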