在R

时间:2018-10-12 08:47:19

标签: r function random random-forest

我受命用R中的引导程序手动编写随机森林预测变量。

代码完成后,我意识到该模型返回的行数是输入测试数据的两倍,这很奇怪...

有人可以帮我看看我哪里出问题了吗?代码粘贴在下面,有问题的部分标有星号。

data_train <- read.csv("A3_train.csv") 
data_test <- read.csv("A3_test.csv")
data_train$y <- as.factor(data_train$y) 

set.seed(1234)

data_use <- data_train[1:1500,]
data_compute <- data_train[1501:2000,]

train_random_forest <- function(n_trees, n_features,                             
training_data, target_col_name){
models <- lapply(1:n_trees, function(i) {

自举

n_samples <- nrow(training_data)
sample_row_use <- sample(1:n_samples, n_samples, replace=TRUE)
new_training_data <- training_data[sample_row_use, ]

sub_dataset <- subset(new_training_data,select=-c(y))
feature_subset <- sub_dataset[sample_row_use,1:n_features]
y = new_training_data$y
new_training_data <- cbind(feature_subset,y)

formula <- as.formula(paste(target_col_name, '~.'))
new_model <- rpart(formula, data=new_training_data)

best_cp <- new_model$cptable[which.min(new_model$cptable[,'xerror']), 'CP']
new_model = rpart(formula, data=new_training_data, control = rpart.control(cp=best_cp))
return(new_model)
})
return(models)
}

predict_random_forest <- function(models, test_data) {
preds <- sapply(models, function(model) {
return(predict(model, test_data))
})
return(rowSums(preds) / length(models))
}

models_rf <- train_random_forest(50, 4, data_use, 'y')

**pred_rf_prob <- predict_random_forest(models_rf, data_compute)**

0 个答案:

没有答案