所以我有一个162 x 151的数据集: -
RT (seconds) 76_TI2 114_DECC 120_Lop 212_PCD
38 4.086 1.2 2.322 0
40 2.732 0.815 1.837 1.113
41 4.049 1.153 2.117 2.354
41 4.049 1.153 2.117 3.838
42 4.56 1.224 2.128 2.38
42 2.96 0.909 1.686 0.972
42 3.237 0.96 1.922 1.202
44 2.989 0.8 1.761 2.034
我想使用10折交叉验证在其上构建随机森林模型,然后查看每一折的预测值及其对应的实际值。我使用的是randomForest包。我做了: -
# Fix the RNG seed so that the run can be reproduced.
> set.seed(1500)
# 10-fold cross-validation via rfcv(); x and y are the asker's predictors and
# response (their construction is not shown here).
> model <- rfcv(x,y, cv.fold=10)
但是我还没有找到一种方法来简单地查看从每个折叠中获得的所有预测值以及与其对应的实际值。我该怎么做呢?
谢谢
答案 0 :(得分:1)
交叉验证获得的预测值存储在model$predicted[[1]]
中,观察值为y
。如果您想分别查看每个折叠的预测值,您需要获得有关折叠分割的信息。为此,您可以:
1)手动拆分折叠并自行进行交叉验证
2)使用caret
包
3)稍微修改rfcv
以输出此信息 - 将idx
添加到输出列表
#' Cross-validated random forest with stepwise variable reduction that also
#' reports the fold assignment.
#'
#' For every fold a forest is fitted on all predictors, the predictors are
#' ranked by the first column of the forest's importance matrix, and further
#' forests are fitted on progressively smaller sets of top-ranked predictors.
#' Held-out predictions are collected for each predictor-set size.
#'
#' @param trainx Matrix or data frame of predictors (rows are observations).
#' @param trainy Response vector; a factor selects classification, anything
#'   else is treated as regression.
#' @param cv.fold Number of cross-validation folds.
#' @param scale If `"log"`, the number of predictors shrinks geometrically by
#'   the factor `step`; otherwise it shrinks linearly by `step` per stage.
#' @param step Fraction kept per stage (`scale == "log"`) or number of
#'   predictors removed per stage (otherwise; the sign is ignored).
#' @param mtry Function mapping a number of predictors to the `mtry` value
#'   passed to `randomForest()`.
#' @param recursive If `TRUE`, predictors are re-ranked after every reduction
#'   stage using the importance of the reduced model.
#' @param ... Further arguments forwarded to `randomForest()`.
#' @return A list with
#'   `n.var` (predictor counts tried),
#'   `error.cv` (misclassification rate or mean squared error per count),
#'   `predicted` (list of held-out prediction vectors, one per count), and
#'   `idx` (fold number, 1..`cv.fold`, assigned to each observation).
rfcv2 <- function(trainx, trainy, cv.fold = 5, scale = "log", step = 0.5,
                  mtry = function(p) max(1, floor(sqrt(p))),
                  recursive = FALSE, ...) {
  classRF <- is.factor(trainy)
  n <- nrow(trainx)
  p <- ncol(trainx)

  # Sequence of predictor counts to evaluate, largest first.
  if (scale == "log") {
    k <- floor(log(p, base = 1 / step))
    # seq_len() keeps this empty when k == 0 (p == 1); `0:(k - 1)` would
    # count backwards and produce a count larger than p.
    n.var <- round(p * step^(seq_len(k) - 1))
    same <- diff(n.var) == 0
    if (any(same)) {
      n.var <- n.var[-which(same)]
    }
    if (!1 %in% n.var) {
      n.var <- c(n.var, 1)
    }
  } else {
    # The sequence runs downwards from p, so the increment must be negative;
    # accept either sign for `step`.
    n.var <- seq(from = p, to = 1, by = -abs(step))
  }
  k <- length(n.var)

  # One prediction vector per predictor count, initialised with trainy so the
  # type (factor / numeric) and length are right; every entry is overwritten
  # once its fold has been held out.
  cv.pred <- vector("list", k)
  for (i in seq_len(k)) {
    cv.pred[[i]] <- trainy
  }

  # Stratify the fold assignment: by class for classification, by five
  # rank-based groups of the response for regression.
  if (classRF) {
    f <- trainy
  } else {
    f <- factor(rep(1:5, length.out = length(trainy))[order(order(trainy))])
  }
  nlvl <- table(f)
  idx <- numeric(n)
  for (i in seq_along(nlvl)) {
    in_stratum <- which(f == levels(f)[i])
    idx[in_stratum] <- sample(rep(seq_len(cv.fold), length.out = nlvl[i]))
  }

  for (i in seq_len(cv.fold)) {
    held_out <- idx == i

    # Full model on all predictors; its importance gives the initial ranking.
    all.rf <- randomForest(
      trainx[!held_out, , drop = FALSE], trainy[!held_out],
      trainx[held_out, , drop = FALSE], trainy[held_out],
      mtry = mtry(p), importance = TRUE, ...
    )
    cv.pred[[1]][held_out] <- all.rf$test$predicted
    impvar <- seq_len(p)[order(all.rf$importance[, 1], decreasing = TRUE)]

    # seq_len(k)[-1] is empty when k == 1, unlike `2:k`.
    for (j in seq_len(k)[-1]) {
      imp.idx <- impvar[seq_len(n.var[j])]
      sub.rf <- randomForest(
        trainx[!held_out, imp.idx, drop = FALSE], trainy[!held_out],
        trainx[held_out, imp.idx, drop = FALSE], trainy[held_out],
        mtry = mtry(n.var[j]), importance = recursive, ...
      )
      cv.pred[[j]][held_out] <- sub.rf$test$predicted
      if (recursive) {
        # order() yields positions within the reduced predictor set; map them
        # back through imp.idx so impvar keeps referring to columns of trainx.
        impvar <- imp.idx[order(sub.rf$importance[, 1], decreasing = TRUE)]
      }
    }
  }

  if (classRF) {
    error.cv <- vapply(cv.pred, function(x) mean(trainy != x), numeric(1))
  } else {
    error.cv <- vapply(cv.pred, function(x) mean((trainy - x)^2), numeric(1))
  }
  names(error.cv) <- names(cv.pred) <- n.var

  list(n.var = n.var, error.cv = error.cv, predicted = cv.pred, idx = idx)
}
现在你可以致电
# Run the modified cross-validation; the returned list carries the fold
# assignment in `idx` in addition to n.var, error.cv and predicted.
model <- rfcv2(x,y, cv.fold=10)
# Fold number (1..cv.fold) assigned to each observation. Held-out predictions
# from the all-variables model are in model$predicted[[1]], in the same order
# as y, so e.g. split(data.frame(pred = model$predicted[[1]], obs = y), model$idx)
# groups predicted/observed pairs by fold.
model$idx # returns the folds split.
请注意rfcv
函数不是为纯交叉验证而是为变量选择而设计的。因此,您执行了大量冗余计算。