Question

我有一个随机森林模型，目前建立在 100 个不同的变量上。我希望只挑选“最重要”的变量来构建随机森林，以尝试提高性能，但除了从 rf$importance 获取变量重要性之外，我不知道该从何入手。

我的数据只包含已经缩放的数值变量。

以下是我的RF代码：

# Fit a random forest of 1501 trees with `x` as the response and every other
# column of `train` as a predictor; importance = TRUE requests the variable
# importance measures alongside the fit.
rf.2 <- randomForest(x ~ ., data = train, importance = TRUE, ntree = 1501)

# ---- Training set ----
# Ask the fitted forest for class probabilities (type = "prob") on `train`.
# NOTE(review): newdata = train scores the rows the forest was fit on, so the
# ratio below is likely optimistic - confirm this is the intended measure.
rf_prob_train <- data.frame(predict(rf.2, newdata = train, type = "prob"))
# Keep only the X0 column (presumably the probability of the class labelled 0).
rf_prob_train <- data.frame(rf_prob_train$X0)
# Pair each probability with the observed outcome under fixed column names.
val_rf_train <- setNames(cbind(rf_prob_train, train$x), c("Probs", "x"))

# Accuracy ratio on the training set: rank-correlate the negated probability
# with the observed outcome and read the second statistic of the result
# (expected to be Somers' Dxy - confirm against the rcorr.cens documentation).
x <- data.frame(rcorr.cens(-val_rf_train$Probs, val_rf_train$x))
rf_train_AR <- x[2, 1]
rf_train_AR

# ---- Test set ----
# Ask the fitted forest for class probabilities (type = "prob") on `test`.
rf_prob_test <- data.frame(predict(rf.2, newdata = test, type = "prob"))
# Keep only the X0 column (presumably the probability of the class labelled 0).
rf_prob_test <- data.frame(rf_prob_test$X0)
# Pair each probability with the observed outcome under fixed column names.
val_rf_test <- setNames(cbind(rf_prob_test, test$x), c("Probs", "x"))

# Accuracy ratio on the test set: rank-correlate the negated probability with
# the observed outcome and read the second statistic of the result
# (expected to be Somers' Dxy - confirm against the rcorr.cens documentation).
x <- data.frame(rcorr.cens(-val_rf_test$Probs, val_rf_test$x))
rf_test_AR <- x[2, 1]
rf_test_AR

Answer 1

今天比较忙，所以没能早点回复你。下面用一个通用数据集来说明大致思路。

library(randomForest)
library(datasets)

# Peek at the first rows of the example data.
head(iris)

# Build the model formula from the column names so that predictors can be
# dropped programmatically: column 5 is the response, all other columns are
# joined with "+" to form the right-hand side.
var.predict <- paste(names(iris)[-5], collapse = "+")
rf.form <- as.formula(paste(names(iris)[5], "~", var.predict))

# Show the formula used for the current iteration of the random forest.
print(rf.form)

# Fit a 100-tree forest on every predictor, requesting importance measures.
iris.rf <- randomForest(rf.form, data = iris, importance = TRUE, ntree = 100)

# Examine the variable importance plot.
varImpPlot(iris.rf)

# Return the column position, within `data`, of the predictor to which `fit`
# assigns the smallest MeanDecreaseAccuracy (the least relevant variable).
#
# The rows of `fit$importance` cover only the predictors that were in that
# model, so a row number stops matching a column number of `data` as soon as
# one variable has been dropped. Resolving the variable by name and mapping it
# back to `names(data)` keeps the index valid on every iteration.
#
# fit:  a randomForest object fitted with importance = TRUE.
# data: the data frame whose columns the returned index refers to.
# Returns a single integer column index (the first one if there is a tie).
least_important_column <- function(fit, data) {
  importance_table <- fit$importance
  weakest_row <- which.min(importance_table[, "MeanDecreaseAccuracy"])
  match(rownames(importance_table)[weakest_row], names(data))
}

# Drop the variable with the lowest decrease in accuracy.
to.remove <- least_important_column(iris.rf, iris)

# Rinse, wash hands, repeat: rebuild the formula without the dropped columns
# (column 5 is the response) and refit.
var.predict <- paste(names(iris)[-c(5, to.remove)], collapse = "+")
rf.form <- as.formula(paste(names(iris)[5], var.predict, sep = " ~ "))

iris.rf <- randomForest(rf.form, data = iris, importance = TRUE, ntree = 100)

# Examine the variable importance plot of the reduced model.
varImpPlot(iris.rf)

# Append the next weakest variable. The helper returns an index into
# names(iris), not into the reduced model's importance table, so the
# accumulated `to.remove` can be reused directly in names(iris)[-c(5, to.remove)].
to.remove <- c(to.remove, least_important_column(iris.rf, iris))

# And so on...

随机森林变量选择

1 个答案: