Question

我是 R 的新手（第 2 天），任务是构建一组随机森林。每个随机森林使用不同的训练集构建，最后我们会把所有森林组合起来进行预测。我正在用 R 实现这一点，但在组合两个使用不同训练集构建的森林时遇到了一些困难。我的尝试如下：

# Load the two training chunks; each forest is grown on its own chunk.
d1 <- read.csv("../data/rr/train/10/chunk0.csv", header = TRUE)
d2 <- read.csv("../data/rr/train/10/chunk1.csv", header = TRUE)

# Grow one small forest per chunk, predicting A55 from all other columns.
rf1 <- randomForest(A55 ~ ., data = d1, ntree = 10)
rf2 <- randomForest(A55 ~ ., data = d2, ntree = 10)

# Merge the two forests into a single randomForest object.
rf <- combine(rf1, rf2)

这当然会产生错误：

Error in rf$votes + ifelse(is.na(rflist[[i]]$votes), 0, rflist[[i]]$votes) : 
non-conformable arrays
In addition: Warning message:
In rf$oob.times + rflist[[i]]$oob.times :
longer object length is not a multiple of shorter object length

我已经在网上搜索了一段时间，但还没有找到解决办法。非常感谢任何帮助。

Answer 1

啊。这可能是 combine 中的一处疏忽，也可能是您想做的事情本身就没有意义，这取决于您的观点。

投票矩阵记录了森林对训练数据中每个样本在每个响应类别上的投票数。自然，它的行数与训练数据的行数相同。

combine 假设您在同一个数据集上运行了两次随机森林，因此这些矩阵的维度是相同的。它这样做是因为想为组合后的森林提供一些“总体”误差估计。

但是如果两个数据集不同，合并投票矩阵就完全没有意义了。您可以简单地从较大的训练数据集中删除一行，让 combine 能够运行，但组合森林中得到的投票矩阵将是一堆乱码，因为每一行都是两个不同训练样本的投票的混合。

所以，也许这应该是 combine 中一个可以关闭的选项。因为把实际的树组合起来，并在得到的对象上调用 predict，仍然是有意义的。但 combine 输出中的某些“组合”误差估计将毫无意义。

长话短说，让每个训练数据集大小相同，它就能运行。但如果您这样做，除了做新的预测之外，我不会把结果对象用于任何其他用途。任何为了汇总森林性能而合并出来的内容都是没有意义的。

不过，我认为 combine 的预期用法是：在完整数据集上拟合多个随机森林，每个森林使用较少的树，然后再把这些森林组合起来。

修改

我索性修改了 combine，使其能够“处理”大小不等的训练集。实际上这只是意味着我删除了一大段试图把不匹配的内容拼接在一起的代码。但我保留了组合森林的部分，因此您仍然可以使用 predict：

#' Combine randomForest objects grown on different training sets
#'
#' A cut-down variant of `combine()` that only merges the `forest`
#' components of its arguments, so the result can be used with `predict()`
#' on new data. The first argument is used as a template: every component
#' that is not explicitly overwritten or cleared below is carried over from
#' the first forest unchanged and therefore describes that forest only.
#'
#' @param ... One or more objects inheriting from class "randomForest".
#'   All of them must report the same predictor variables, in the same
#'   order, as row names of `importance(x)`.
#' @return The first forest with `ntree` set to the total number of trees
#'   and `forest` replaced by the merged trees (or `NULL` if any input has
#'   no `forest` component). The error summaries (`confusion`/`err.rate`
#'   for classification, `mse`/`rsq` for regression, and their `test`
#'   counterparts when every input has a `test` component) are set to
#'   `NULL`.
my_combine <- function(...) {
  # Append zeros to a vector until it has length `len`.
  pad0 <- function(x, len) c(x, rep(0, len - length(x)))
  # Append all-zero rows to a matrix until it has `len` rows.
  padm0 <- function(x, len) {
    rbind(x, matrix(0, nrow = len - nrow(x), ncol = ncol(x)))
  }

  rflist <- list(...)
  # Without at least one forest there is no template to start from.
  if (length(rflist) == 0L) {
    stop("Argument must be a list of randomForest objects")
  }
  areForest <- vapply(
    rflist,
    function(x) inherits(x, "randomForest"),
    logical(1)
  )
  if (!all(areForest)) {
    stop("Argument must be a list of randomForest objects")
  }

  # Use the first forest as the template for the result.
  rf <- rflist[[1]]
  classRF <- rf$type == "classification"
  ntree <- sum(unlist(lapply(rflist, function(x) x$ntree)))
  rf$ntree <- ntree
  haveTest <- !any(vapply(rflist, function(x) is.null(x$test), logical(1)))

  # All forests must have been grown on the same predictor variables.
  vlist <- lapply(rflist, function(x) rownames(importance(x)))
  numvars <- lengths(vlist)
  if (!all(numvars[1] == numvars[-1])) {
    stop("Unequal number of predictor variables in the randomForest objects.")
  }
  for (vars in vlist) {
    if (!all(vars == vlist[[1]])) {
      stop("Predictor variables are different in the randomForest objects.")
    }
  }

  # Merge the forest components only if every input kept its forest.
  haveForest <- vapply(rflist, function(x) !is.null(x$forest), logical(1))
  if (all(haveForest)) {
    nrnodes <- max(unlist(lapply(rflist, function(x) x$forest$nrnodes)))

    # Zero-pad one per-node matrix of every forest to `nrnodes` rows and
    # bind the results column-wise.
    bind_padded <- function(field) {
      do.call(
        "cbind",
        lapply(rflist, function(x) padm0(x$forest[[field]], nrnodes))
      )
    }

    rf$forest$nrnodes <- nrnodes
    # lapply() + unlist() always yields a flat vector, even when every
    # forest has the same number of trees.
    rf$forest$ndbigtree <- unlist(
      lapply(rflist, function(x) x$forest$ndbigtree)
    )
    for (field in c("nodestatus", "bestvar", "xbestsplit", "nodepred")) {
      rf$forest[[field]] <- bind_padded(field)
    }

    if (classRF) {
      # Pad the first dimension of each treemap array to `nrnodes`, then
      # concatenate along the third dimension.
      padded_maps <- lapply(
        rflist,
        function(x) apply(x$forest$treemap, 2:3, pad0, nrnodes)
      )
      rf$forest$treemap <- array(unlist(padded_maps), c(nrnodes, 2, ntree))
    } else {
      rf$forest$leftDaughter <- bind_padded("leftDaughter")
      rf$forest$rightDaughter <- bind_padded("rightDaughter")
    }

    rf$forest$ntree <- ntree
    if (classRF) {
      rf$forest$cutoff <- rflist[[1]]$forest$cutoff
    }
  } else {
    rf$forest <- NULL
  }

  # The stitching of the remaining components was removed on purpose.
  # Clear the error summaries so they are not mistaken for estimates of
  # the combined forest.
  if (classRF) {
    rf$confusion <- NULL
    rf$err.rate <- NULL
    if (haveTest) {
      rf$test$confusion <- NULL
      # Clear the test-set error rate too, not rf$err.rate a second time.
      rf$test$err.rate <- NULL
    }
  } else {
    rf$mse <- rf$rsq <- NULL
    if (haveTest) {
      rf$test$mse <- rf$test$rsq <- NULL
    }
  }

  rf
}

然后你可以像这样测试它：

# Smoke test for my_combine(): grow two forests on training sets of unequal
# size, merge them, and predict on the full data set.
library(randomForest)

data(iris)
# Shuffle the rows, then split them into a 70-row and an 80-row training set.
d <- iris[sample(nrow(iris)), ]
d1 <- d[1:70, ]
d2 <- d[71:nrow(d), ]
rf1 <- randomForest(Species ~ ., d1, ntree = 50, norm.votes = FALSE)
rf2 <- randomForest(Species ~ ., d2, ntree = 50, norm.votes = FALSE)
rf.all <- my_combine(rf1, rf2)
predict(rf.all, newdata = iris)

显然，这段代码不提供任何保证！:)

在R中组合使用不同训练集构建的随机森林

1 个答案: