这是我目前通过 randomForest 包使用随机森林的方式:
library (randomForest)
# Fit a 100-tree forest predicting CLA from every other column of `dat`;
# norm.votes = FALSE asks randomForest to keep raw vote counts.
rf1 <- randomForest(
  CLA ~ .,
  data = dat,
  ntree = 100,
  norm.votes = FALSE
)

# Predicted response for each row of the `testing` set.
p1 <- predict(rf1, newdata = testing, type = "response")

# Cross-tabulate predictions (rows) against testing_CLA$CLA (columns);
# presumably the latter holds the true labels of `testing` -- TODO confirm.
confMat_rf1 <- table(p1, testing_CLA$CLA)

# Share of cases on the diagonal of the table, i.e. prediction == reference.
accuracy_rf1 <- sum(diag(confMat_rf1)) / sum(confMat_rf1)
我完全不想使用 randomForest 包。给定数据集(dat),如何只使用 rpart,并沿用 randomForest 包的默认设置,得到相同的结果?例如,对于 100 棵决策树,我需要执行以下操作:
# Fit one CART tree per pre-built resampled data set (100 in total).
# cp = -1 is passed to rpart as the complexity parameter; a negative value
# means no split is rejected for insufficient improvement.
for (i in seq_len(100)) {
  cart.models[[i]] <- rpart(
    CLA ~ .,
    data = random_dataset[[i]],
    cp = -1
  )
}
每个 random_dataset[[i]] 都是按 randomForest 的默认数量随机抽取属性(列)和样本(行)得到的数据集。此外,randomForest 包内部使用了 rpart 吗?
答案 0 :(得分:1)
可以用 rpart 在训练集的 bootstrap 样本及随机抽取的特征子集上训练多棵树,以此模拟随机森林的训练过程。下面的代码片段训练 10 棵树来对鸢尾花(iris)的种类进行分类,并返回一个列表,其中每个元素包含一棵树及其袋外(out-of-bag)精度。
library(rpart)
library(Metrics)
library(doParallel)
library(foreach)
library(ggplot2)
#' Train a bagged ensemble of rpart trees (a simplified random forest)
#'
#' Every tree is fitted on a bootstrap sample of the rows of `train_data`
#' and on a random subset of the predictor terms, then scored on the rows
#' that were left out of its bootstrap sample (out-of-bag, OOB). Trees are
#' grown in parallel on a temporary local cluster that is always shut down
#' when the function exits.
#'
#' Note that the feature subset is drawn once per tree, not once per split
#' as in the randomForest package.
#'
#' @param train_data Data frame holding the response and the predictors.
#' @param train_formula Two-sided model formula. Its right-hand side (with
#'   `.` expanded against `train_data`) defines the candidate predictor terms.
#' @param method Fitting method forwarded to `rpart()`.
#' @param feature_per Fraction (0, 1] of predictor terms drawn, without
#'   replacement, for each tree. At least one term is always drawn.
#' @param cp,min_split,min_bucket,max_depth Forwarded to `rpart.control()`
#'   as `cp`, `minsplit`, `minbucket` and `maxdepth`.
#' @param ntrees Number of trees to grow (>= 1).
#' @return A list of length `ntrees`. Each element is a list with `tree`
#'   (the fitted rpart object) and `oob_perf` (OOB accuracy when
#'   `method = "class"` and the tree has OOB rows, otherwise `NA_real_`).
random_forest <- function(train_data, train_formula, method = "class",
                          feature_per = 0.7, cp = 0.01, min_split = 20,
                          min_bucket = round(min_split / 3), max_depth = 30,
                          ntrees = 10) {
  stopifnot(
    length(train_formula) == 3L,
    is.numeric(feature_per), length(feature_per) == 1L,
    feature_per > 0, feature_per <= 1,
    is.numeric(ntrees), length(ntrees) == 1L, ntrees >= 1
  )

  target_variable <- as.character(train_formula)[[2]]

  # Candidate predictors come from the formula itself, so both `y ~ .` and
  # explicit right-hand sides are honoured.
  feature_terms <- attr(terms(train_formula, data = train_data), "term.labels")
  n_features <- length(feature_terms)
  if (n_features == 0L) {
    stop("`train_formula` has no predictor terms.", call. = FALSE)
  }
  # Never let the per-tree subset collapse to zero predictors.
  n_bagged <- max(1L, as.integer(floor(n_features * feature_per)))

  # Loop-invariant, so build it once instead of once per tree.
  tree_control <- rpart.control(
    minsplit = min_split,
    minbucket = min_bucket,
    cp = cp,
    maxdepth = max_depth
  )

  # detectCores() may return NA on some platforms; fall back to one worker.
  ncores <- detectCores(logical = FALSE)
  if (is.na(ncores) || ncores < 1L) {
    ncores <- 1L
  }
  cl <- makeCluster(min(ncores, ntrees))
  # Clean up even if a worker fails, and do not leave the foreach backend
  # pointing at a cluster that no longer exists.
  on.exit(
    {
      stopCluster(cl)
      registerDoSEQ()
    },
    add = TRUE
  )
  registerDoParallel(cl)

  foreach(
    tree_id = seq_len(ntrees),
    .packages = c("rpart", "Metrics")
  ) %dopar% {
    # Random feature subset for this tree; keep the original response.
    bagged_terms <- sample(feature_terms, n_bagged, replace = FALSE)
    tree_formula <- reformulate(bagged_terms, response = train_formula[[2]])

    # Bootstrap rows; everything not drawn is this tree's OOB set.
    # drop = FALSE keeps a data frame even when there is a single column.
    index_bag <- sample.int(nrow(train_data), replace = TRUE)
    in_train_bag <- train_data[index_bag, , drop = FALSE]
    out_train_bag <- train_data[-index_bag, , drop = FALSE]

    tree <- rpart(
      formula = tree_formula,
      data = in_train_bag,
      method = method,
      control = tree_control
    )

    # Accuracy is only meaningful for classification trees, and only when
    # at least one row was left out of the bootstrap sample.
    oob_acc <- NA_real_
    if (identical(method, "class") && nrow(out_train_bag) > 0L) {
      oob_pred <- predict(tree, newdata = out_train_bag, type = "class")
      oob_acc <- accuracy(
        actual = out_train_bag[[target_variable]],
        predicted = oob_pred
      )
    }

    list(tree = tree, oob_perf = oob_acc)
  }
}
# Example: predict Species from all remaining iris columns, using the
# default settings of random_forest().
train_formula <- Species ~ .
forest <- random_forest(iris, train_formula)