Question

我有一个大型数据帧（~700 nx 36000 p）并计划在R中进行randomForest分析。由于将完整帧发送到randomForest的运行时负担（即使使用并行计算和512 GB RAM），我希望在许多独立运行（Nruns）中向randomForest发送数据帧的不同随机子样本（~5％p）。对于较小的数据帧，我创建了一个foreach循环，将整个数据帧发送到randomForest，并返回一个重要结果矩阵，即dim（p，Nruns）加上3个附加行，其中包含在每个Nrun中生成的一些附加信息。但是，我无法构造脚本的foreach（）组件，以便为每次运行将数据帧的不同子样本发送到randomForest。（子采样包括两个步骤：首先通过对行进行采样（此部分有效）创建平衡数据集（在结果类上），然后选择列的子集。）所需的结果仍然是dim的数据帧（p + 3， Nruns）但每列只包含在该列表示的运行中随机选择的变量的结果（即，没有为该运行选择的变量将缺少值）。当我提交下面的代码（使用下面创建的假数据）时，我收到以下错误： “错误调用组合功能： “ 注意，如代码所示，如果我排除选择随机列的步骤，但保留平衡完成的步骤，我不会得到错误，输出是预期的（带有暗淡（p + 3，Nruns）并且所有单元格都具有非零值。）因此，问题出在完成列采样的代码部分。我想知道是否有人可以建议对下面的代码进行补救，这些代码将为1：Nruns中的每一个执行新的随机子列化（和行）。

感谢您的任何建议。

##########################################################################
# CREATE FAKE DATA
##########################################################################
FAKEinput <- 
data.frame(A=sample(25:75,20, replace=T), B=sample(1:2,20,replace=T), C=as.factor(sample(0:1,20,replace=T,prob=c(0.3,0.7))),
    D=sample(200:350,20,replace=T), E=sample(2300:2500,20,replace=T), F=sample(92000:105000,20,replace=T),
    G=sample(280:475,20,replace=T),H=sample(470:550,20,replace=T),I=sample(2537:2723,20,replace=T),
    J=sample(2984:4199,20,replace=T),K=sample(222:301,20,replace=T),L=sample(28:53,20,replace=T),
    M=sample(3:9,20,replace=T),N=sample(0:2,20,replace=T),O=sample(0:5,20,replace=T),P=sample(0:2,20,replace=T),
    Q=sample(0:2,20,replace=T), R=sample(0:2,20,replace=T), S=sample(0:7,20,replace=T))

##########################################################################
# set FOREST DATASET
##########################################################################
forestData <- FAKEinput

##########################################################################
# set Outcome 
##########################################################################
Outcome <- "C"

##########################################################################
#  set DV
#########################################################################
forestDV <- forestData$C
str(forestDV) #factor

##########################################################################
#set up number of runs:
##########################################################################
Nruns<-5

##########################################################################
#set up ntree
##########################################################################
ntree=100

###########################################################################
#set up mtry:
###########################################################################
mtry=round(sqrt(ncol(forestData)))  #4

###########################################################################
## CREATE DATASET WITH ONLY THE PREDICTORS (I.E., OMIT OUTCOME).
###########################################################################
dropVars <- names(forestData) %in% c(Outcome)
forestPREDICTORS <- forestData[!dropVars] 

###########################################################################
#set seed first to replicate the random draw of seeds
###########################################################################
set.seed(3456)

###########################################################################
# GENERATE Nruns RANDOMSEEDS
###########################################################################
randomseed<- sample(1:(length(forestData[,1])),Nruns, replace=TRUE) #16 16 18 8 11

##########################################
#Load necessary packages into R's memory
##########################################
require(iterators)
require(foreach)
require(parallel)
require(doParallel)
require(randomForest)

###########################################
# Get the number of available logical cores
###########################################
cores <- detectCores()
cores

###########################################
# Print info on computer, OS, cores
###########################################
print(paste('Processor: ', Sys.getenv('PROCESSOR_IDENTIFIER')), sep='')
print(paste('OS: ', Sys.getenv('OS')), sep='')
print(paste('Cores: ', cores, sep=''))

##################################################################################################
#  Set up new function, called ’ImpOOBerr':
# 1 )write in the set random seed part that uses the same ‘i’ from the ‘foreach’ loops 
# 2) save the importance and summary measures output from the random forest run
# 3) combine all of the importance scores and OOB error summary results (as columns) into single matrix
# * other options tried to correct error commented out.
###################################################################################################
ImpOOBerr<-function(y,d) { 
set.seed(randomseed[i])
out.model<-randomForest(y ~ ., 
    data=d, 
    ntree=ntree,
    mtry=mtry,
    nodesize=0.1*nrow(forestData),
    importance=TRUE,
    proximity=FALSE)
# create the frame before filling with values?
#out<-data.frame(matrix(nrow=ncol(forestPREDICTORS)+3, ncol=Nruns))
out<-rbind(importance(out.model, type=1, scale=FALSE),
    mean(out.model$err.rate[,1]),
    rbind(t(t(quantile(out.model$err.rate[,1], probs=c(0.025, 0.975))))))
#rownames(out) <- c(names(forestPREDICTORS),'meanOOB','oobL95CI', 'oobU95CI') # name all the rows
# OR name only newly-added rows since randomForest importance output preserves the variable names
rownames(out)[(nrow(out)-2):nrow(out)]<-c('meanOOB','oobL95CI', 'oobU95CI') 
return(out)
}

###########################################################################
# SET UP THE CLUSTER
###########################################################################
#Setup clusters via parallel/DoParallel
cl.spec <- rep("localhost", 10)
cl <- makeCluster(cl.spec, type="SOCK")
registerDoParallel(cl, cores=10)

###########################################################################
# Employ foreach to carry out randomForest in parallel
##########################################################################
system.time(fakeRF <- foreach(i=1:Nruns, .combine='cbind', .packages='randomForest') 
    %dopar% {    #<<change to %do% to see speed difference

######################################################################################################
# FIRST, BALANCE THE DATASET ON OUTCOME CLASS FOR INPUT TO randomForest CLASSIFICATION
######################################################################################################
dat1<-forestData[forestData$C==1,]
dat0<-forestData[forestData$C==0,]

####################################################
# RESET the seed to make sure it is updating and 
# giving different samples for each run
####################################################
set.seed(randomseed[i])

####################################################
# OVERSAMPLE FROM SMALLER GROUP TO BALANCE DATASET
####################################################
rands=sample(1:dim(dat0)[1],dim(dat1)[1], replace=TRUE) 
balancedCLASS<-rbind(dat0[rands,],dat1) 

######################################################################################################
# NOW DO RANDOM SAMPLES OF THE COLUMNS (VARIABLES) TO CREATE NEW DATA SUBSETS TO SEND TO randomForest
# AT EACH RUN
# NOTE: TO TEST SCRIPT WITHOUT COLUMN SAMPLING, COMMENT OUT ALL SCRIPT BETWEEN TWO "#xxxxxxxxx.." ROWS
# AND UNCOMMENT THE NEXT THREE LINES
######################################################################################################
#forestData<-balancedCLASS
#forestDV<-balancedCLASS$C
#forestPREDICTORS <- balancedCLASS[!names(balancedCLASS) %in% c('C')]

##xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
##xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
####################################################
# PULL OUT PREDICTORS (i.e., exclue the outcome) 
# before sampling the columns
####################################################
PREDICTORS <- balancedCLASS[!names(balancedCLASS) %in% c('C')]

####################################################
# from the row-balanced set created above, 
# draw a 5-column subset for each run
####################################################
randsCOL= sample(1:dim(PREDICTORS)[2], 5, replace=FALSE) 

####################################################
# BIND OUTCOME VAR BACK ONTO RANDOM COL SET
####################################################
Set_BALrandsCOL <- cbind(balancedCLASS$C, balancedCLASS[,randsCOL]) 

####################################################
# FIX OUTCOME NAME (was retained as "balancedCLASS$C")
####################################################
names(Set_BALrandsCOL)[names(Set_BALrandsCOL)=="balancedCLASS$C"] <- "C"

####################################################
# ASSIGN THE OUTCOME OF SAMPLING BACK TO 
# forestData, forestDV and forestPREDICTORS for RF runs
####################################################
forestData<-Set_BALrandsCOL
forestDV<-Set_BALrandsCOL$C
forestPREDICTORS <- Set_BALrandsCOL[!names(Set_BALrandsCOL) %in% c('C')]
##xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
##xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

#############################################################################################
# CALL FUNCTION THAT WILL RUN randomForest AND COMBINE THE OUTPUT FROM EACH RUN
#############################################################################################
ImpOOBerr(forestDV, forestPREDICTORS)
})

##########################
# stop the cluster
##########################
stopCluster(cl)

#############################################################################################
# SAVE THE OUTPUT TO FILE
#############################################################################################
save(fakeRF, file="D:/RF/WORKING/fakeRF.rda")

Answer 1

我已经解决了上述问题。来自r2evans的第一条评论让我可以修复随机列子集部分。然后，我对ImpOOBerr函数进行了一些修改，以强制函数的输出在每次运行中具有相同数量的观察值（和相同的row.names）。这允许cbind在％dopar％语句中工作。感谢您的反馈和建议。

##########################################################################
# CREATE FAKE DATA
##########################################################################
FAKEinput <- 
data.frame(A=sample(25:75,20, replace=T), B=sample(1:2,20,replace=T), C=as.factor(sample(0:1,20,replace=T,prob=c(0.3,0.7))),
    D=sample(200:350,20,replace=T), E=sample(2300:2500,20,replace=T), F=sample(92000:105000,20,replace=T),
    G=sample(280:475,20,replace=T),H=sample(470:550,20,replace=T),I=sample(2537:2723,20,replace=T),
    J=sample(2984:4199,20,replace=T),K=sample(222:301,20,replace=T),L=sample(28:53,20,replace=T),
    M=sample(3:9,20,replace=T),N=sample(0:2,20,replace=T),O=sample(0:5,20,replace=T),P=sample(0:2,20,replace=T),
    Q=sample(0:2,20,replace=T), R=sample(0:2,20,replace=T), S=sample(0:7,20,replace=T))

##########################################################################
# set FOREST DATASET
##########################################################################
forestData0 <- FAKEinput

##########################################################################
# set Outcome 
##########################################################################
Outcome <- "C"

##########################################################################
#  set DV
#########################################################################
forestDV0 <- forestData0$C

##########################################################################
#set up number of runs:
##########################################################################
Nruns<-5

##########################################################################
#set up ntree
##########################################################################
ntree=100

###########################################################################
## CREATE DATASET WITH ONLY THE PREDICTORS (I.E., OMIT OUTCOME).
###########################################################################
dropVars <- names(forestData0) %in% c(Outcome)
forestPREDICTORS0 <- forestData0[!dropVars] 

###########################################################################
# CREATE single-column dataframe, whichi will be used to send the
# FULL SET OF PREDICTORS TO ImpOOBerr() OUTPUT MATRIX
# Automatically-generated column name is unwieldy; change that to Predictor.
###########################################################################
VARS <-data.frame(c(names(forestPREDICTORS0),'ZZZmeanOOB','ZZZoobL95CI', 'ZZZoobU95CI'))
VARS <- rename(VARS, c(c.names.forestPREDICTORS0....ZZZmeanOOB....ZZZoobL95CI....ZZZoobU95CI..="Predictor"))
row.names(VARS) <- VARS$Predictor

###########################################################################
#set seed first to replicate the random draw of seeds
###########################################################################
set.seed(3456)

###########################################################################
# GENERATE Nruns RANDOMSEEDS
###########################################################################
randomseed<- sample(1:Nruns,Nruns, replace=FALSE) 

##########################################
#Load necessary packages into R's memory
##########################################
require(iterators)
require(foreach)
require(parallel)
require(doParallel)
require(randomForest)

###########################################
# Get the number of available logical cores
###########################################
cores <- detectCores()
cores

###########################################
# Print info on computer, OS, cores
###########################################
print(paste('Processor: ', Sys.getenv('PROCESSOR_IDENTIFIER')), sep='')
print(paste('OS: ', Sys.getenv('OS')), sep='')
print(paste('Cores: ', cores, sep=''))

##################################################################################################
# SET UP NEW FUNCTION, called ’ImpOOBerr':
# 1) write in the set random seed part that uses the same ‘i’ from the ‘foreach’ loops 
# 2) save the importance and summary measures output from the random forest run
# 3) combine all of the importance scores and OOB error summary results (as columns) into single matrix
# 4) merge the ANNOTS dataset with each 'out' file so that cbind function will work 
# (requires same # of rows and same row.names)
###################################################################################################
ImpOOBerr<-function(y,d) { 
set.seed(randomseed[i])
out.model<-randomForest(y ~ ., 
    data=d, 
    ntree=ntree,
    mtry=mtry,
    nodesize=0.1*nrow(forestData),
    importance=TRUE,
    proximity=FALSE)
out<-rbind(importance(out.model, type=1, scale=FALSE),
    mean(out.model$err.rate[,1]),
    rbind(t(t(quantile(out.model$err.rate[,1], probs=c(0.025, 0.975))))))
rownames(out)[(nrow(out)-2):nrow(out)]<-c('ZZZmeanOOB','ZZZoobL95CI', 'ZZZoobU95CI') 
out2<- merge(ANNOTS, out, by="row.names", all.x=TRUE)
row.names(out2) <- out2$Row.names
out2 <- out2[,-1]
out2 <- out2[order(row.names(out2)),]
out3 <- data.frame(out2[,-1,drop=FALSE]) # !!!! THIS WORKS !!!
return(out3)
}

###########################################################################
# SET UP THE CLUSTER
###########################################################################
#Setup clusters via parallel/DoParallel
cl.spec <- rep("localhost", 30)
cl <- makeCluster(cl.spec, type="SOCK")
registerDoParallel(cl, cores=30)

###########################################################################
# Employ foreach to carry out randomForest in parallel
##########################################################################
system.time(fakeRF <- foreach(i=1:Nruns, .combine='cbind', .packages='randomForest') 
    %dopar% {    #<<change to %do% to see speed difference

######################################################################################################
# FIRST, BALANCE THE DATASET ON OUTCOME CLASS FOR INPUT TO randomForest CLASSIFICATION
######################################################################################################
dat1<-forestData[forestData$C==1,]
dat0<-forestData[forestData$C==0,]

####################################################
# RESET the seed to make sure it is updating and 
# giving different samples for each run
####################################################
set.seed(randomseed[i])

####################################################
# OVERSAMPLE FROM SMALLER GROUP TO BALANCE DATASET
####################################################
rands=sample(1:dim(dat0)[1],dim(dat1)[1], replace=TRUE) 
balancedCLASS<-rbind(dat0[rands,],dat1) 

######################################################################################################
# SELECT RANDOM SAMPLES OF THE COLUMNS (VARIABLES) TO CREATE NEW DATA SUBSETS TO SEND TO randomForest
# AT EACH RUN
######################################################################################################

#################################################################
# FROM ROW-BALANCED SET CREATED ABOVE (balancedCLASS),
# DRAW A 5% COL (5% OF 35365=1768) SUBSET FOR EACH RUN
# OMIT THE OUTCOME COLUMN (3) FROM THE RANDOM SELECTION
#################################################################
randsCOLs= sample(balancedCLASS[,-c(3)], 5, replace=FALSE) 

####################################################
# BIND OUTCOME VAR BACK ONTO RANDOM COL SET
####################################################
Set_BALrandsCOL <- cbind(balancedCLASS$C, randsCOLs) 

####################################################
# FIX OUTCOME NAME (was retained as "balancedCLASS$C")
####################################################
names(Set_BALrandsCOL)[names(Set_BALrandsCOL)=="balancedCLASS$C"] <- "C"

####################################################
# ASSIGN THE OUTCOME OF SAMPLING BACK TO 
# forestData, forestDV and forestPREDICTORS for RF runs
####################################################
forestData<-Set_BALrandsCOL
forestDV<-Set_BALrandsCOL$C
forestPREDICTORS <- Set_BALrandsCOL[!names(Set_BALrandsCOL) %in% c('C')]

#############################################################################################
# CALL FUNCTION THAT WILL RUN randomForest AND COMBINE THE OUTPUT FROM EACH RUN
#############################################################################################
ImpOOBerr(forestDV, forestPREDICTORS)
})

##########################
# stop the cluster
##########################
stopCluster(cl)

#############################################################################################
# SAVE THE OUTPUT TO FILE
#############################################################################################
save(fakeRF, file="D:/LearningMachines/RF/Knight_ADNI/WORKING/fakeRF.rda")

R：在randomForest（）调用中使用带有sample（）过程的foreach（）

1 个答案: