为随机森林和提升树叠加决策边界

时间:2019-04-19 14:33:58

标签: r random-forest

我生成了一些随机数据,并尝试用随机森林和梯度提升算法进行拟合,然后在散点图上叠加决策边界。下面的代码可以重现这个问题。我先生成数据,然后拟合一棵回归树,用以下代码可以很容易地叠加其决策边界:

# Modelling packages first; tidyverse last so its verbs are not masked
library(gbm)
library(randomForest)
library(rsample)
library(tree)
library(tidyverse)
# Fix the RNG seed so the simulated data and the train/test split are reproducible
set.seed(123)

# Simulate 100 points; class "A" lies outside the circle x1^2 + x2^2 = 1.39,
# class "B" inside it
Dat <- tibble(
    x1 = rnorm(100),
    x2 = rnorm(100)
) %>% mutate(y = as_factor(ifelse(x1^2 + x2^2 > 1.39, "A", "B")))

# Points on the true decision boundary (circle of radius sqrt(1.39))
circlepts <- tibble(theta = seq(0, 2 * pi, length.out = 100)) %>%
    mutate(x = sqrt(1.39) * sin(theta), y = sqrt(1.39) * cos(theta))

# Base plot: observations coloured by class plus the true boundary.
# Built before y is recoded below, so the legend keeps the "A"/"B" labels.
p <- ggplot(Dat) + geom_point(aes(x1, x2, color = y)) + coord_fixed() +
    geom_polygon(data = circlepts, aes(x, y), color = "blue", fill = NA)

# Recode the class label as numeric: "A" -> 1, "B" -> 0
Dat$y <- as.numeric(Dat$y == "A")

# Split the data 70/30 into training and testing sets
datasplit <- initial_split(Dat, prop = 0.7)
training_set <- as_tibble(training(datasplit))
testing_set <- as_tibble(testing(datasplit))

# Fit a tree on the training set using both predictors
tree_fit <- tree(y ~ ., training_set)

# 50 x 50 prediction grid; each axis spans the range of its own predictor
# (the x2 axis previously reused the x1 range)
grid <- crossing(
    x1 = modelr::seq_range(testing_set$x1, 50),
    x2 = modelr::seq_range(testing_set$x2, 50)
) %>%
    modelr::add_predictions(tree_fit)

# Overlay the fitted boundary, using the same axis mapping as the points in p
# (x1 horizontal, x2 vertical)
p + geom_contour(data = grid, aes(x1, x2, z = as.numeric(pred)), binwidth = 1)

现在,如果我对随机森林或梯度提升模型做同样的事情,add_predictions 就无法正常工作了……

# Fit a random forest on the training split (y is still the numeric 0/1 column)
rf_fit <- randomForest(y ~ ., data = training_set, mtry = 2, ntree = 500)

# 50 x 50 prediction grid; each axis spans the range of its own predictor
# (the x2 axis previously reused the x1 range)
grid <- crossing(
    x1 = modelr::seq_range(testing_set$x1, 50),
    x2 = modelr::seq_range(testing_set$x2, 50)
) %>%
    modelr::add_predictions(rf_fit)

# Overlay on p with the same axis mapping as the points (x1 horizontal, x2 vertical)
p + geom_contour(data = grid, aes(x1, x2, z = as.numeric(pred)), binwidth = 1)
##ERROR: Error in if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") : argument is of length zero

对于梯度提升:

# Boosted model with 1000 trees and a gaussian loss on the numeric 0/1 response.
# NOTE(review): fitted on the full Dat rather than training_set — confirm intended.
fitBoost <- gbm(y ~ ., data = Dat, distribution = "gaussian",
                n.trees = 1000)

# Calling predict() directly works because n.trees is supplied explicitly
pred <- predict(fitBoost, newdata = training_set, n.trees = 1000)

# 50 x 50 prediction grid; each axis spans the range of its own predictor
# (the x2 axis previously reused the x1 range).
# add_predictions() gives no way to pass n.trees, hence the error reported below.
grid <- crossing(
    x1 = modelr::seq_range(testing_set$x1, 50),
    x2 = modelr::seq_range(testing_set$x2, 50)
) %>%
    modelr::add_predictions(fitBoost)
### ERROR: Error in paste("Using", n.trees, "trees...\n") : argument "n.trees" is missing, with no default

这似乎是一个非常简单的问题。有人可以帮我吗?

1 个答案:

答案 0 :(得分:1)

以下代码可用于您的随机森林:

# Convert the response to a factor before fitting the forest
training_set$y <- factor(training_set$y)
rf_fit <- randomForest(y ~ ., data = training_set, mtry = 2, ntree = 500)

# 50 x 50 prediction grid; each axis spans the range of its own predictor
# (the x2 axis previously reused the x1 range)
grid <- crossing(
    x1 = modelr::seq_range(testing_set$x1, 50),
    x2 = modelr::seq_range(testing_set$x2, 50)
) %>%
    modelr::add_predictions(rf_fit)

# pred is a factor here; as.numeric() turns it into level codes for contouring.
# Axis mapping matches the points in p (x1 horizontal, x2 vertical).
p + geom_contour(data = grid, aes(x1, x2, z = as.numeric(pred)), binwidth = 1)

enter image description here

这是梯度提升机(GBM)的代码:

# Boosted model with 1000 trees and a gaussian loss.
# NOTE(review): fitted on the full Dat rather than training_set — confirm intended.
fitBoost <- gbm(
    y ~ .,
    data = Dat,
    distribution = "gaussian",
    n.trees = 1000
)

# Predictions for the training rows, using all 1000 trees
pred <- predict(fitBoost, newdata = training_set, n.trees = 1000)

#' Add model predictions to a data frame, supplying `n.trees` to `predict()`
#'
#' Variant of an add-predictions helper for models whose `predict()` method
#' needs an `n.trees` argument.
#'
#' @param data Data frame that receives the prediction column.
#' @param model Fitted model, passed on to `predict2()`.
#' @param var Name of the column in which the predictions are stored.
#' @param type Optional prediction type forwarded to `predict()`; it is not
#'   passed at all when `NULL`.
#' @param n.trees Number of trees forwarded to `predict()`. Defaults to 1000,
#'   the value that used to be hard-coded.
#' @return `data` with the predictions stored in column `var`.
add_predictions2 <- function(data, model, var = "pred", type = NULL,
                             n.trees = 1000) {
    data[[var]] <- predict2(model, data, type = type, n.trees = n.trees)
    data
}

#' Call `predict()` with an explicit `n.trees`
#'
#' @param model Fitted model.
#' @param data New data to predict on.
#' @param type Optional prediction type; omitted from the call when `NULL`.
#' @param n.trees Number of trees forwarded to `predict()` (default 1000).
#' @return Whatever the model's `predict()` method returns.
predict2 <- function(model, data, type = NULL, n.trees = 1000) {
    # Only forward `type` when the caller set it, so the method's own default applies
    if (is.null(type)) {
        stats::predict(model, data, n.trees = n.trees)
    } else {
        stats::predict(model, data, type = type, n.trees = n.trees)
    }
}

# 50 x 50 prediction grid; each axis spans the range of its own predictor
# (the x2 axis previously reused the x1 range)
grid <- crossing(
    x1 = modelr::seq_range(testing_set$x1, 50),
    x2 = modelr::seq_range(testing_set$x2, 50)
) %>%
    add_predictions2(fitBoost)

# Overlay on p with the same axis mapping as the points (x1 horizontal, x2 vertical)
p + geom_contour(data = grid, aes(x1, x2, z = as.numeric(pred)), binwidth = 1)

enter image description here