我受命用R中的引导程序手动编写随机森林预测变量。
代码完成后,我意识到该模型返回的行数是输入测试数据的两倍,这很奇怪...
有人可以帮我看看我哪里出问题了吗?代码粘贴在下面,有问题的部分标有星号。
data_train <- read.csv("A3_train.csv")
data_test <- read.csv("A3_test.csv")
data_train$y <- as.factor(data_train$y)
set.seed(1234)
data_use <- data_train[1:1500,]
data_compute <- data_train[1501:2000,]
train_random_forest <- function(n_trees, n_features,
training_data, target_col_name){
models <- lapply(1:n_trees, function(i) {
自举
n_samples <- nrow(training_data)
sample_row_use <- sample(1:n_samples, n_samples, replace=TRUE)
new_training_data <- training_data[sample_row_use, ]
sub_dataset <- subset(new_training_data,select=-c(y))
feature_subset <- sub_dataset[sample_row_use,1:n_features]
y = new_training_data$y
new_training_data <- cbind(feature_subset,y)
formula <- as.formula(paste(target_col_name, '~.'))
new_model <- rpart(formula, data=new_training_data)
best_cp <- new_model$cptable[which.min(new_model$cptable[,'xerror']), 'CP']
new_model = rpart(formula, data=new_training_data, control = rpart.control(cp=best_cp))
return(new_model)
})
return(models)
}
predict_random_forest <- function(models, test_data) {
preds <- sapply(models, function(model) {
return(predict(model, test_data))
})
return(rowSums(preds) / length(models))
}
models_rf <- train_random_forest(50, 4, data_use, 'y')
**pred_rf_prob <- predict_random_forest(models_rf, data_compute)**