I am seeing strange behavior from discrete AdaBoost with stumps: the predictions on the test data are more accurate than the predictions on the training data. I fit the stumps with rpart as follows:
library(rpart)

boostingDisNew = function(trainData, testData, n.Trees) {
  fmt = matrix(0, nrow = nrow(trainData), ncol = n.Trees)   # staged train predictions
  fms = matrix(0, nrow = nrow(testData),  ncol = n.Trees)   # staged test predictions
  w = rep(1/nrow(trainData), nrow(trainData))               # initial observation weights
  for (i in 1:n.Trees) {
    # weighted stump: maxdepth = 1; cp = -1 and minsplit = 0 force a split
    M1 = rpart(V1 ~ ., method = "class", data = trainData, weights = w,
               control = rpart.control(maxdepth = 1, cp = -1, minsplit = 0))
    # prediction and weights (class labels are assumed to be coded -1/+1)
    pt = as.numeric(as.vector(predict(M1, newdata = trainData, type = "class")))
    ms = which(pt != trainData[, 1])     # misclassified observations
    em = sum(w[ms])/sum(w)               # weighted training error
    cm = log((1 - em)/em)                # tree weight alpha_m
    w[ms] = w[ms]*exp(cm)                # up-weight the misclassified points
    w = w/sum(w)                         # renormalize
    ###### Prediction on training data
    fmt[, i] = cm*pt
    ###### Prediction on testing data
    ps = as.numeric(as.vector(predict(M1, newdata = testData, type = "class")))
    fms[, i] = cm*ps
  }
  return(list(fmt = fmt, fms = fms))
}
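
For reference, this is discrete AdaBoost (AdaBoost.M1, Algorithm 10.1 in Hastie, Tibshirani and Friedman, The Elements of Statistical Learning): at step $m$ the weighted error, the tree weight and the weight update are
$$\mathrm{err}_m = \frac{\sum_i w_i \, I\big(y_i \neq G_m(x_i)\big)}{\sum_i w_i}, \qquad \alpha_m = \log\frac{1-\mathrm{err}_m}{\mathrm{err}_m}, \qquad w_i \leftarrow w_i \, e^{\alpha_m I(y_i \neq G_m(x_i))},$$
and the final classifier is $G(x) = \operatorname{sign}\big(\sum_{m=1}^{M} \alpha_m G_m(x)\big)$, which is what the sign(rowSums(...)) step further down computes.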
DISboost = boostingDisNew(trainData, testData, 1000)
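
Here trainData and testData are my own data frames. For a self-contained run, a toy setup along these lines should work (hypothetical data; the one hard assumption is that the first column V1 holds the class labels coded -1/+1, which the comparisons inside the function rely on):

set.seed(1)
n = 200
X = data.frame(x1 = rnorm(n), x2 = rnorm(n))
toy = data.frame(V1 = ifelse(X$x1 + X$x2 > 0, 1, -1), X)   # labels coded -1/+1
toyBoost = boostingDisNew(toy[1:100, ], toy[101:200, ], 50)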
TRAINDISAdaBoost = function(x) {
  rowSUMS = sign(rowSums(x))                   # committee vote; 0 on exact ties
  tab = table(rowSUMS, trainData[, 1])         # confusion table
  perc = sum(rowSUMS == trainData[, 1])/nrow(trainData)
  return(list(rowSUMS = rowSUMS, tab = tab, perc = perc))
}
TESTDISAdaBoost = function(x) {
  rowSUMS = sign(rowSums(x))
  tab = table(rowSUMS, testData[, 1])
  perc = sum(rowSUMS == testData[, 1])/nrow(testData)
  return(list(rowSUMS = rowSUMS, tab = tab, perc = perc))
}
# staged accuracy (in %) as a function of the number of trees
MMM = seq(10, 1000, by = 50)
mm = NULL
for (i in 1:length(MMM)) mm[i] = TRAINDISAdaBoost(DISboost[[1]][, 1:MMM[i]])[[3]]*100
nn = NULL
for (i in 1:length(MMM)) nn[i] = TESTDISAdaBoost(DISboost[[2]][, 1:MMM[i]])[[3]]*100
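
To eyeball the train/test gap, the two staged-accuracy curves can be plotted with base graphics (a sketch using the mm, nn and MMM objects above):

plot(MMM, mm, type = "l", ylim = range(c(mm, nn)),
     xlab = "number of trees", ylab = "accuracy (%)")
lines(MMM, nn, lty = 2)
legend("bottomright", legend = c("train", "test"), lty = c(1, 2))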
> mm
[1] 87.4 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.8 88.8 89.3 89.4 89.4 89.4 89.4
> nn
[1] 90.0 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.5 89.9 89.9 89.9 89.9