
时间:2013-12-09 06:31:47

标签: r random-forest cross-validation

所以我有一个162 x 151的数据集: -

RT (seconds)    76_TI2  114_DECC    120_Lop 212_PCD
38  4.086   1.2 2.322   0
40  2.732   0.815   1.837   1.113
41  4.049   1.153   2.117   2.354
41  4.049   1.153   2.117   3.838
42  4.56    1.224   2.128   2.38
42  2.96    0.909   1.686   0.972
42  3.237   0.96    1.922   1.202
44  2.989   0.8 1.761   2.034

我想使用10折交叉验证在该数据集上构建随机森林模型,然后查看每一折的预测值和实际值。我使用的是randomForest包。我做了: -

> set.seed(1500)
> model <- rfcv(x,y, cv.fold=10)



1 个答案:

答案 0 :(得分:1)




3)稍微修改rfcv以输出此信息——将idx(每个样本所属的折编号)添加到输出列表

# Cross-validated random forest with stepwise variable reduction.
#
# A lightly modified copy of randomForest::rfcv() that additionally returns
# the fold assignment of every observation, so that per-fold predictions can
# be matched against the actual responses.
#
# Arguments:
#   trainx    Matrix or data frame of predictors (rows are observations).
#   trainy    Response vector; a factor triggers classification, anything
#             else is treated as regression.
#   cv.fold   Number of cross-validation folds.
#   scale     "log" reduces the number of predictors geometrically by `step`;
#             any other value uses an arithmetic sequence with increment
#             `step` (which must then be negative).
#   step      Reduction factor (scale = "log") or increment (otherwise).
#   mtry      Function mapping a number of predictors to the `mtry` value
#             handed to randomForest().
#   recursive If TRUE, variable importance is re-assessed on each reduced
#             model instead of only on the full model.
#   ...       Further arguments forwarded to randomForest().
#
# Returns a list with components:
#   n.var      Number of predictors used at each step.
#   error.cv   Cross-validated error for each step (misclassification rate
#              for classification, mean squared error for regression).
#   predicted  List with one element per step holding the cross-validated
#              predictions, in the order of `trainy`.
#   idx        Fold number (1..cv.fold) assigned to each observation.
rfcv2 <- function(trainx, trainy, cv.fold = 5, scale = "log", step = 0.5,
                  mtry = function(p) max(1, floor(sqrt(p))), recursive = FALSE,
                  ...) {
  classRF <- is.factor(trainy)
  n <- nrow(trainx)
  p <- ncol(trainx)

  # Build the decreasing sequence of predictor counts to evaluate.
  if (scale == "log") {
    k <- floor(log(p, base = 1 / step))
    # seq_len(k) - 1 equals 0:(k - 1) for k >= 1 and is empty for k == 0,
    # where 0:(k - 1) would wrongly count down from 0 to -1.
    n.var <- round(p * step^(seq_len(k) - 1))
    same <- diff(n.var) == 0
    if (any(same)) {
      n.var <- n.var[-which(same)]
    }
    if (!1 %in% n.var) {
      n.var <- c(n.var, 1)
    }
  } else {
    n.var <- seq(from = p, to = 1, by = step)
  }
  k <- length(n.var)

  # One prediction vector per step, initialised with the response so that the
  # type (and factor levels, for classification) is preserved on assignment.
  cv.pred <- rep(list(trainy), k)

  # Stratify the folds: by class for classification, by five interleaved
  # rank-based groups of the response for regression.
  if (classRF) {
    f <- trainy
  } else {
    f <- factor(rep(1:5, length.out = length(trainy))[order(order(trainy))])
  }
  nlvl <- table(f)
  idx <- numeric(n)
  for (i in seq_along(nlvl)) {
    idx[which(f == levels(f)[i])] <-
      sample(rep(seq_len(cv.fold), length.out = nlvl[i]))
  }

  for (i in seq_len(cv.fold)) {
    in_test <- idx == i
    in_train <- idx != i

    # Full model: supplies predictions for step 1 and the importance ranking
    # that drives the variable reduction.
    all.rf <- randomForest(trainx[in_train, , drop = FALSE],
                           trainy[in_train],
                           trainx[in_test, , drop = FALSE],
                           trainy[in_test],
                           mtry = mtry(p), importance = TRUE, ...)
    cv.pred[[1]][in_test] <- all.rf$test$predicted
    impvar <- (1:p)[order(all.rf$importance[, 1], decreasing = TRUE)]

    # seq_len(k)[-1] is 2..k, and empty when k == 1 (2:k would yield c(2, 1)).
    for (j in seq_len(k)[-1]) {
      imp.idx <- impvar[1:n.var[j]]
      sub.rf <- randomForest(trainx[in_train, imp.idx, drop = FALSE],
                             trainy[in_train],
                             trainx[in_test, imp.idx, drop = FALSE],
                             trainy[in_test],
                             mtry = mtry(n.var[j]), importance = recursive,
                             ...)
      cv.pred[[j]][in_test] <- sub.rf$test$predicted
      # For recursive selection, rank variables using the reduced model.
      if (recursive) {
        impvar <- seq_along(imp.idx)[order(sub.rf$importance[, 1],
                                           decreasing = TRUE)]
      }
    }
  }

  if (classRF) {
    error.cv <- vapply(cv.pred, function(x) mean(trainy != x), numeric(1))
  } else {
    error.cv <- vapply(cv.pred, function(x) mean((trainy - x)^2), numeric(1))
  }
  names(error.cv) <- names(cv.pred) <- n.var
  list(n.var = n.var, error.cv = error.cv, predicted = cv.pred, idx = idx)
}


# Run the 10-fold cross-validation with the modified function.
model <- rfcv2(x, y, cv.fold = 10)
# Fold number assigned to each observation.
model[["idx"]]
