我正在阅读 Luis Torgo 的教科书《Data Mining with R》第2章中的代码,该代码也可以在这里找到: http://www.dcc.fc.up.pt/~ltorgo/DataMiningWithR/code2.html。
我想准确理解bestScores函数输出中分数的含义。我认为它们是NMSE(归一化均方误差),但我原以为“归一化”意味着这些分数应介于0和1之间。从输出来看,似乎分数越低越好,但我想确认这一点。请注意,experimentalComparison函数运行大约需要1-2分钟。
# Install rpart and DMwR when they are missing, then attach them.
# requireNamespace() only checks availability; library() stops with an error
# if a package still cannot be loaded, whereas require() would just return
# FALSE and let the script fail later at the first missing function.
# NOTE(review): DMwR may no longer be installable from CRAN -- confirm.
if (!requireNamespace("rpart", quietly = TRUE)) install.packages("rpart")
library(rpart)
if (!requireNamespace("DMwR", quietly = TRUE)) install.packages("DMwR")
library(DMwR)
# Load the algae data and drop the rows reported by manyNAs().
data(algae)
algae <- algae[-manyNAs(algae), ]

# Imputed copy of the data, produced by knnImputation() with k = 10.
clean.algae <- knnImputation(algae, k = 10)

# Linear model for a1 on columns 1 to 12 of the imputed data, then reduced
# with step().
lm.a1 <- lm(formula = a1 ~ ., data = clean.algae[, 1:12])
final.lm <- step(object = lm.a1)

# Regression tree for a1, fitted on `algae` rather than on the imputed copy.
rt.a1 <- rpart(formula = a1 ~ ., data = algae[, 1:12])

# Predictions of each model on the data it was fitted from.
lm.predictions.a1 <- predict(object = final.lm, newdata = clean.algae)
rt.predictions.a1 <- predict(object = rt.a1, newdata = algae)
# Fit a tree with rpartXse() on `train` and score it on `test`.
#
# Args:
#   form:  model formula; the response values are extracted with resp().
#   train: data used to fit the tree.
#   test:  data the fitted tree is evaluated on.
#   ...:   forwarded unchanged to rpartXse().
#
# Returns:
#   A length-one numeric vector named "nmse": the mean squared error of the
#   tree on `test`, divided by the mean squared error obtained when the mean
#   of the training response is used as the prediction for every test case.
#   The ratio is not capped at 1: a value above 1 means the tree did worse
#   than that constant baseline.
cv.rpart <- function(form, train, test, ...) {
  tree <- rpartXse(form, train, ...)
  predicted <- predict(tree, test)
  observed <- resp(form, test)
  model_mse <- mean((predicted - observed)^2)
  baseline_mse <- mean((mean(resp(form, train)) - observed)^2)
  c(nmse = model_mse / baseline_mse)
}
# Fit a linear model with lm() on `train` and score it on `test`.
#
# Args:
#   form:  model formula; the response values are extracted with resp().
#   train: data used to fit the model.
#   test:  data the fitted model is evaluated on.
#   ...:   forwarded unchanged to lm().
#
# Returns:
#   A length-one numeric vector named "nmse": the mean squared error of the
#   (clipped) predictions on `test`, divided by the mean squared error
#   obtained when the mean of the training response is used as the prediction
#   for every test case. The ratio is not capped at 1: a value above 1 means
#   the model did worse than that constant baseline.
cv.lm <- function(form, train, test, ...) {
  fit <- lm(form, train, ...)
  raw <- predict(fit, test)
  # Negative predictions are replaced by 0 before scoring.
  clipped <- ifelse(raw < 0, 0, raw)
  observed <- resp(form, test)
  model_mse <- mean((clipped - observed)^2)
  baseline_mse <- mean((mean(resp(form, train)) - observed)^2)
  c(nmse = model_mse / baseline_mse)
}
# Compare one linear-model variant against three tree variants on task a1.
res <- experimentalComparison(
  # Single predictive task: a1 modelled from columns 1 to 12 of clean.algae.
  c(dataset(a1 ~ ., clean.algae[, 1:12], "a1")),
  c(
    variants("cv.lm"),
    # Three tree models, each with a different complexity (one per `se`).
    variants("cv.rpart", se = c(0, 0.5, 1))
  ),
  # 3 repetitions of 10-fold cross-validation; 1234 is the random seed.
  cvSettings(3, 10, 1234)
)

# Print the variant labelled "cv.rpart.v1" from the comparison results.
getVariant("cv.rpart.v1", res)
# Build one dataset object per target column (columns 12 to 18 of
# clean.algae), each pairing that target with predictor columns 1 to 11.
DSs <- sapply(
  names(clean.algae)[12:18],
  function(target, predictors) {
    # Formula of the form "<target> ~ .".
    task_formula <- as.formula(paste(target, "~ ."))
    # A dataset object represents all the information needed for one
    # predictive task: dataset(formula, data, name).
    dataset(task_formula, clean.algae[, c(predictors, target)], target)
  },
  names(clean.algae)[1:11]
)
# Run the same four learner variants on every task stored in DSs.
res.all <- experimentalComparison(
  DSs,
  c(
    variants("cv.lm"),
    variants("cv.rpart", se = c(0, 0.5, 1))
  ),
  # Same argument order as the a1-only run: 5 repetitions of 10-fold
  # cross-validation with seed 1234.
  cvSettings(5, 10, 1234)
)

# Print the best-scoring variant for each task.
bestScores(res.all)
这是输出:
> bestScores(res.all)
$a1
system score
nmse cv.rpart.v1 0.64231
$a2
system score
nmse cv.rpart.v3 1
$a3
system score
nmse cv.rpart.v2 1
$a4
system score
nmse cv.rpart.v2 1
$a5
system score
nmse cv.lm.v1 0.9316803
$a6
system score
nmse cv.lm.v1 0.9359697
$a7
system score
nmse cv.rpart.v3 1.029505