我正在用 R 处理一份包含 120 个特征/变量、400 个观测值的数据。运行随机森林(RF)并查看变量重要性后,我发现只有 10–15 个变量非常重要。用全部特征对验证数据做最终预测(X1_Pred)只能得到 60% 的分数;但如果我手动只保留重要性排名前 40 的特征/变量,分数会提高到 73%。我知道如何找出重要变量,但不知道如何在预测步骤中只使用这些重要变量来提高分数。请提供代码,帮我解决这个问题。
library(randomForest)
library(e1071)
library(caret)
#library(ROCR)
# ---- Setup ----
# NOTE(review): setwd() in scripts is fragile; prefer absolute paths or here::here().
setwd("------")
data <- read.csv(file = "Corr_Table_All3.csv", header = TRUE)

# Reproducible 60% training / 40% validation split:
# sample.ind is a vector of length nrow(data) containing 1s and 2s.
set.seed(123)
sample.ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.6, 0.4))
cross.train <- data[sample.ind == 1, ]
cross.valid <- data[sample.ind == 2, ]

# Build the full-model formula X1 ~ <all other columns> from the TRAINING data.
varNames <- names(cross.train)
varNames <- varNames[!varNames %in% c("X1")]
varNames1 <- paste(varNames, collapse = "+")
rf.form <- as.formula(paste("X1", varNames1, sep = " ~ "))

# Fit a first RF on ALL predictors purely to measure variable importance.
cross.rf <- randomForest(rf.form, cross.train, ntree = 500, importance = TRUE)

# type = 2 gives MeanDecreaseGini (node-impurity-based importance).
var.imp <- data.frame(importance(cross.rf, type = 2))
var.imp$Variables <- row.names(var.imp)
varImpPlot(cross.rf, sort = TRUE, main = "Variable Importance", n.var = 40)

# ---- Feature selection: keep the top-N most important variables ----
# This is the step the original script was missing: rank variables by
# MeanDecreaseGini and rebuild the model using only the best n.top of them.
n.top <- 40
imp.order <- order(var.imp$MeanDecreaseGini, decreasing = TRUE)
top.vars <- var.imp$Variables[imp.order][seq_len(min(n.top, nrow(var.imp)))]

# Refit the random forest on the reduced predictor set.
rf.form.top <- as.formula(paste("X1", paste(top.vars, collapse = "+"), sep = " ~ "))
cross.rf.top <- randomForest(rf.form.top, cross.train, ntree = 500, importance = TRUE)

# ---- Prediction on the VALIDATION data using only the top variables ----
cross.valid$X1_Pred <- predict(cross.rf.top, cross.valid)