I am seeing strange behavior from discrete AdaBoost with stumps: the predictions on the test data are more accurate than the predictions on the training data. I fit the stumps with rpart as follows:
library(rpart)

boostingDisNew = function(trainData, testData, n.Trees) {
  fmt = matrix(0, nrow = nrow(trainData), ncol = n.Trees)   # staged train predictions
  fms = matrix(0, nrow = nrow(testData),  ncol = n.Trees)   # staged test predictions
  w = rep(1/nrow(trainData), nrow(trainData))               # initial observation weights
  for (i in 1:n.Trees) {
    # weighted stump: maxdepth = 1; cp = -1 and minsplit = 0 force a split
    M1 = rpart(V1 ~ ., method = "class", data = trainData, weights = w,
               control = rpart.control(maxdepth = 1, cp = -1, minsplit = 0))
    # prediction and weights (class labels are assumed to be coded -1/+1)
    pt = as.numeric(as.vector(predict(M1, newdata = trainData, type = "class")))
    ms = which(pt != trainData[, 1])     # misclassified observations
    em = sum(w[ms])/sum(w)               # weighted training error
    cm = log((1 - em)/em)                # tree weight alpha_m
    w[ms] = w[ms]*exp(cm)                # up-weight the misclassified points
    w = w/sum(w)                         # renormalize
    ###### Prediction on training data
    fmt[, i] = cm*pt
    ###### Prediction on testing data
    ps = as.numeric(as.vector(predict(M1, newdata = testData, type = "class")))
    fms[, i] = cm*ps
  }
  return(list(fmt = fmt, fms = fms))
}
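
For reference, this is discrete AdaBoost (AdaBoost.M1, Algorithm 10.1 in Hastie, Tibshirani and Friedman, The Elements of Statistical Learning): at step $m$ the weighted error, the tree weight and the weight update are
$$\mathrm{err}_m = \frac{\sum_i w_i \, I\big(y_i \neq G_m(x_i)\big)}{\sum_i w_i}, \qquad \alpha_m = \log\frac{1-\mathrm{err}_m}{\mathrm{err}_m}, \qquad w_i \leftarrow w_i \, e^{\alpha_m I(y_i \neq G_m(x_i))},$$
and the final classifier is $G(x) = \operatorname{sign}\big(\sum_{m=1}^{M} \alpha_m G_m(x)\big)$, which is what the sign(rowSums(...)) step further down computes.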
DISboost = boostingDisNew(trainData, testData, 1000)
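
Here trainData and testData are my own data frames. For a self-contained run, a toy setup along these lines should work (hypothetical data; the one hard assumption is that the first column V1 holds the class labels coded -1/+1, which the comparisons inside the function rely on):

set.seed(1)
n = 200
X = data.frame(x1 = rnorm(n), x2 = rnorm(n))
toy = data.frame(V1 = ifelse(X$x1 + X$x2 > 0, 1, -1), X)   # labels coded -1/+1
toyBoost = boostingDisNew(toy[1:100, ], toy[101:200, ], 50)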
TRAINDISAdaBoost = function(x) {
  rowSUMS = sign(rowSums(x))                   # committee vote; 0 on exact ties
  tab = table(rowSUMS, trainData[, 1])         # confusion table
  perc = sum(rowSUMS == trainData[, 1])/nrow(trainData)
  return(list(rowSUMS = rowSUMS, tab = tab, perc = perc))
}
TESTDISAdaBoost = function(x) {
  rowSUMS = sign(rowSums(x))
  tab = table(rowSUMS, testData[, 1])
  perc = sum(rowSUMS == testData[, 1])/nrow(testData)
  return(list(rowSUMS = rowSUMS, tab = tab, perc = perc))
}
# staged accuracy (in %) as a function of the number of trees
MMM = seq(10, 1000, by = 50)
mm = NULL
for (i in 1:length(MMM)) mm[i] = TRAINDISAdaBoost(DISboost[[1]][, 1:MMM[i]])[[3]]*100
nn = NULL
for (i in 1:length(MMM)) nn[i] = TESTDISAdaBoost(DISboost[[2]][, 1:MMM[i]])[[3]]*100
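
To eyeball the train/test gap, the two staged-accuracy curves can be plotted with base graphics (a sketch using the mm, nn and MMM objects above):

plot(MMM, mm, type = "l", ylim = range(c(mm, nn)),
     xlab = "number of trees", ylab = "accuracy (%)")
lines(MMM, nn, lty = 2)
legend("bottomright", legend = c("train", "test"), lty = c(1, 2))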
> mm
[1] 87.4 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.5 88.8 88.8 89.3 89.4 89.4 89.4 89.4
> nn
[1] 90.0 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.7 90.5 89.9 89.9 89.9 89.9