几天前,我问了一个关于在循环中调用自定义函数的主题,该循环可以通过
的组合很好地解决。 eval(parse(text = Function text))
这是链接:Automatic creation and use of custom made function in R。
这使我可以使用for loop
并从存储要创建的函数主体的数据框中自动调用所需的函数。
现在,我想将问题带到一个新的高度。我的问题是计算时间。我需要从高光谱图像中评估52个指标。这意味着在R中,我的高光谱图像被加载为512x512x204波段的3d阵列。
我想做的是并行运行索引的评估,以减少计算时间。 这是我要模拟的虚拟示例,但不是并行计算。
# create a fake matrix rappresenting my Hyperpectral image
HYPR_IMG=array(NA,dim=c(5,3,4))
HYPR_IMG[,,1]=1
HYPR_IMG[,,2]=2
HYPR_IMG[,,3]=3
HYPR_IMG[,,4]=4
image.plot(HYPR_IMG[,,1], zlim=c(0,20))
image.plot(HYPR_IMG[,,2], zlim=c(0,20))
image.plot(HYPR_IMG[,,3], zlim=c(0,20))
image.plot(HYPR_IMG[,,4], zlim=c(0,20))
#create a fake DF for simulating my indices stored in the dataframe
IDXname=c("IDX1","IDX2","IDX3","IDX4")
IDXFunc=c("HYPR_IMG[,,1] + 3*HYPR_IMG[,,2]",
"HYPR_IMG[,,3] + HYPR_IMG[,,2]",
"HYPR_IMG[,,4] + HYPR_IMG[,,2] - HYPR_IMG[,,3]",
"HYPR_IMG[,,1] + HYPR_IMG[,,4] + 4*HYPR_IMG[,,2] + HYPR_IMG[,,3]")
IDX_DF=as.data.frame(cbind(IDXname,IDXFunc))
# that was what I did before
Store_DF=data.frame(NA)
for (i in 1: length(IDX_DF$IDXname)) {
IDX_ID=IDX_DF$IDXname[i]
IDX_Fun_tmp=IDX_DF$IDXFunc[which(IDX_DF$IDXname==IDX_ID)] #use for extra care to select the right fuction
IDXFunc_call=paste("IDXfun_tmp=function(HYPR_IMG){",IDX_Fun_tmp,"}",sep="")
eval(parse(text = IDXFunc_call))
IDX_VAL=IDXfun_tmp (HYPR_IMG)
image.plot(IDX_VAL,zlim=c(0,20)); title(main=IDX_ID)
temp_DF=as.vector(IDX_VAL)
Store_DF=cbind(Store_DF,temp_DF)
names(Store_DF)[i+1] <- as.vector(IDX_ID)
}
我的最终目标是拥有相同的Store_DF
,存储所有的Indices值。在这里,我有一个for
循环,但是使用foreach
循环可以加快速度。如果需要,我将Windows 8或更高版本作为OS。
真的有可能吗?
最后,我是否可以减少具有相同Store_DF
数据帧或类似列名称矩阵的类似物的总体计算时间?
非常感谢!
答案 0 :(得分:1)
对于特定示例,使用诸如data.table之类的程序包的并行构建或并行应用可能会更有益。
以下是一个最小示例,说明如何使用parApply
包中的parallel
获得结果。注意,输出是一个矩阵,实际上在基数R中会产生更好的性能(在tidyverse或data.table中不一定是这种情况)。如果data.frame结构至关重要,则必须使用data.frame
进行转换。
cl <- parallel::makeCluster( parallel::detectCores() )
result <- parallel::parApply(cl = cl, X = IDX_DF, MARGIN = 1, FUN = function(x, IMAGES){
IDX_ID <- x[["IDXname"]]
eval(parse(text = paste0("IDXfun_tmp <- function(HYPR_IMG){", x[["IDXFunc"]], "}")))
IDX_VAL <- as.vector(IDXfun_tmp(IMAGES))
names(IDX_VAL) <- IDX_ID
IDX_VAL
}, IMAGES = HYPR_IMG)
colnames(result) = IDXname
IDXname
parallel::stopCluster(cl)
请注意stopCluster(cl)
,这对于关闭任何松散的R会话很重要。
基准测试结果(4个小核):
Unit: milliseconds
expr min lq mean median uq max neval
Loop 8.420432 9.027583 10.426565 9.272444 9.943783 26.58623 100
Parallel 1.382324 1.491634 2.038024 1.554690 1.907728 18.23942 100
对于基准测试的复制,下面提供了代码:
cl <- parallel::makeCluster( parallel::detectCores() )
microbenchmark::microbenchmark(
Loop = {
Store_DF=data.frame(NA)
for (i in 1: length(IDX_DF$IDXname)) {
IDX_ID = IDX_DF$IDXname[i]
IDX_Fun_tmp = IDX_DF$IDXFunc[which(IDX_DF$IDXname == IDX_ID)] #use for extra care to select the right function
eval(parse(text = paste0("IDXfun_tmp = function(HYPR_IMG){", IDX_Fun_tmp, "}")))
IDX_VAL = IDXfun_tmp(HYPR_IMG)
#Plotting in parallel is not a good idea. It will most often not work but might make the R session crash or slow down significantly (at best the latter at worst the prior)
#image.plot(IDX_VAL, zlim = c(0,20)); title(main = IDX_ID)
temp_DF = as.vector(IDX_VAL)
Store_DF = cbind(Store_DF,temp_DF)
names(Store_DF)[i+1] <- as.vector(IDX_ID)
}
rm(Store_DF)
},
Parallel = {
result <- parallel::parApply(cl = cl, X = IDX_DF, MARGIN = 1, FUN = function(x, IMAGES){
IDX_ID <- x[["IDXname"]]
eval(parse(text = paste0("IDXfun_tmp <- function(HYPR_IMG){", x[["IDXFunc"]], "}")))
IDX_VAL <- as.vector(IDXfun_tmp(IMAGES))
names(IDX_VAL) <- IDX_ID
IDX_VAL
}, IMAGES = HYPR_IMG)
colnames(result) = IDXname
rm(result)
}
)
parallel::stopCluster(cl)
在对性能问题(可能是由于内存)进行了一些评论之后,我决定说明一下如何使用foreach软件包获得相同的结果。一些注意事项:
.combine
参数定义的形式组合,该参数可以是rbind,cbind或任何其他函数。下面的代码将产生与上面相同的输出。注意我已经使用了data.table包。有关此软件包的更多信息,我建议their wikipedia on github.
cl <- parallel::makeCluster( parallel::detectCores() )
#Foeach uses doParallel for the parallization
doParallel::registerDoParallel(cl)
#To iterate over the rows, we need to use iterators
# if foreach is given a matrix it will be converted to a column iterators
rowIterator <- iterators::iter(IDX_DF, by = "row")
library(foreach)
result <-
foreach(
#Supply the iterator
row = rowIterator,
#Specify if the calculations needs to be in order. If not then we can get better performance not doing so
.inorder = FALSE,
#In most foreach loops you will have to export the data you need for the calculations
# it worked without doing so for me, in which case it is faster if the exported stuff is large
#.export = c("HYPR_IMG"),
#We need to say how the output should be merged. If nothing is given it will be output as a list
#data.table rbindlist is faster than rbind (returns a data.table)
.combine = function(...)data.table::rbindlist(list(...)) ,
#otherwise we could've used:
#.combine = rbind
#if we dont use rbind or cbind (i used data.table::rbindlist for speed)
# we will have to tell if it can take more than 1 argument
.multicombine = TRUE
) %dopar% #Specify how to do the calculations. %do% loop. %dopar% parallel loop. %:% nested loops (next foreach tells how we do the loop)
{ #to do stuff in parallel we use the %dopar%. Alternative %do%. For multiple foreach we split each of them by %:%
IDX_ID <- row[["IDXname"]]
eval(parse(text = paste0("IDXfun_tmp <- function(HYPR_IMG){", row[["IDXFunc"]], "}")))
IDX_VAL <- as.vector(IDXfun_tmp(HYPR_IMG))
data.frame(ID = IDX_ID, IDX_VAL)
}
#output is saved in result
result
result_reformatted <- dcast(result[,indx := 1:.N, by = ID],
indx~ID,
value.var = "IDX_VAL")
#if we dont want to use data.table we could use unstack instead
unstack(test, IDX_VAL ~ ID)