如何从xgboost模型手动构建预测

时间:2016-11-15 17:31:58

标签: r xgboost

我试图弄清楚如何从xgboost模型树生成概率,以便它们与我从predict函数中获得的概率相匹配。

首先我构建模型

library(xgboost)
#install.packages("ModelMetrics")
library(ModelMetrics)

# Seed R's random number generator before fitting.
set.seed(100)

# - Extreme gbm
# Response: the `y` column of testDF coerced to integer.
y <- as.integer(testDF$y)

# Predictors: every column of testDF except `y`. A logical mask is used instead
# of -which(), which would select zero columns if nothing matched, and
# drop = FALSE keeps a data frame even when a single predictor remains.
x <- testDF[, !(names(testDF) %in% "y"), drop = FALSE]
# Column names are saved here because the matrix rebuilt below has no dimnames.
var.names <- names(x)
x <- as.matrix(x)
# Force a plain numeric matrix of the same shape.
x <- matrix(as.numeric(x), nrow(x), ncol(x))

# Number of boosting rounds.
nround <- 10

# `param` must already exist; it is not defined in the visible snippet.
# The argument is spelled out as `params` rather than relying on partial
# matching of `param`.
XX <- xgboost(
  params = param,
  data = x,
  label = y,
  nrounds = nround,
  missing = NA
)

然后我写了一些代码来构建导致特定叶子的所有规则

# Flatten the trees of the fitted model XX into a table with one row per node.
baseTree <- xgb.model.dt.tree(model = XX)

# Separate terminal nodes from split nodes. filter() is namespaced because the
# visible script never attaches dplyr, and a bare filter() would otherwise
# resolve to stats::filter().
Leafs <- dplyr::filter(baseTree, Feature == "Leaf")
Branches <- dplyr::filter(baseTree, Feature != "Leaf")

# Replace the split feature of every branch by its column name. This treats
# Feature as a zero-based column index, hence the + 1.
Branches$Feature <- var.names[as.numeric(Branches$Feature) + 1]

# Condition, as R source text, under which an observation moves from the split
# node `branch` (a one-row slice of Branches) to its child `child_id`.
branch_condition <- function(branch, child_id) {
  feature <- branch$Feature
  goes_yes <- branch$Yes == child_id
  goes_no <- branch$No == child_id
  goes_missing <- branch$Missing == child_id

  below <- paste0(feature, " < ", branch$Split)
  at_or_above <- paste0(feature, " >= ", branch$Split)
  is_missing <- paste0("is.na(", feature, ")")

  if (goes_yes && goes_missing) {
    paste0("(", below, " | ", is_missing, ")")
  } else if (goes_no && goes_missing) {
    paste0("(", at_or_above, " | ", is_missing, ")")
  } else if (goes_yes) {
    below
  } else if (goes_no) {
    at_or_above
  } else {
    is_missing
  }
}

# Full rule for one leaf: walks from the leaf up to the root of its tree
# (Node 0) and joins the condition of every split on the way with " & ".
leaf_rule <- function(leaf, branches) {
  child_id <- leaf$ID
  node_index <- leaf$Node
  conditions <- character(0)

  while (node_index != 0) {
    parent <- dplyr::filter(
      branches,
      Yes == child_id | No == child_id | Missing == child_id
    )
    # Exactly one split node is expected to point at any given child.
    stopifnot(nrow(parent) == 1L)

    conditions <- c(conditions, branch_condition(parent, child_id))
    child_id <- parent$ID
    node_index <- parent$Node
  }

  # A tree made of a single leaf has no split on its path; an empty rule
  # string could not be evaluated later, so such a leaf always applies.
  if (length(conditions) == 0L) {
    return("TRUE")
  }
  paste(conditions, collapse = " & ")
}

# One row per leaf: the leaf's own columns plus RuleText, the rule that leads
# to it. seq_len() keeps the loop empty when there are no leaves.
AllRules <- do.call(
  rbind,
  lapply(seq_len(nrow(Leafs)), function(i) {
    data.frame(
      Leafs[i, ],
      RuleText = leaf_rule(Leafs[i, ], Branches)
    )
  })
)

现在我选出 1 行并尝试匹配概率。在这个例子中,结果是匹配的。循环会遍历所有规则,并对该客户满足的每条规则标记 TRUE。然后我筛选出这些行,把对应的叶子值相加得到对数几率(log-odds)估计值,再将其转换为概率。

# Pick one observation (row 25 of testDF) to score by hand.
TT <- testDF[25,]

# For every row of AllRules, evaluate that leaf's rule text against TT.
# transmute_() takes the rule as a string and evaluates it on TT's columns, so
# each iteration yields a one-row frame: Tree is the outcome of the rule for
# this observation and Quality is the Quality value of that leaf. The rows are
# stacked with rbind.
# NOTE(review): transmute_() is deprecated in newer dplyr releases, and
# foreach/dplyr are not attached anywhere in the visible script - confirm.
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% {
  TT %>% transmute_(
    Tree = as.character(AllRules$RuleText[i])
    , Quality = AllRules$Quality[i])
}


# Reference value: what the fitted model itself predicts for TT.
predict(XX, as.matrix(TT[,var.names]))
#[1] 0.05571342

# Keep only the rules that evaluated to TRUE for TT, add up their leaf values
# into Q1 and map Q1 through the logistic function.
# NOTE: sqrt(Quality^2) is abs(Quality), so the sign of every leaf value is
# discarded before summing. Prob2 = 1 - Prob1 reproduces the predict() value
# here, which is consistent with every selected leaf value being negative.
filter(ff, Tree) %>% 
  summarise(
    Q1 = sum(sqrt(Quality^2))
    # ,Q2 = sum(sqrt(Quality^2))
    , Prob1 = exp(Q1)/(1+exp(Q1))
    , Prob2 = 1-Prob1
    )
#        Q1     Prob1     Prob2
#1 2.830209 0.9442866 0.0557134

但在这种情况下,它与预测函数不匹配......

# Second observation to score by hand: row 17 of testDF.
TT <- testDF[17,]

# Same evaluation as for the first observation: each rule string in AllRules is
# evaluated on TT by transmute_(), giving one row per leaf with Tree (outcome
# of the rule) and Quality (that leaf's Quality value), stacked with rbind.
# NOTE(review): transmute_() is deprecated in newer dplyr releases - confirm.
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% {
  TT %>% transmute_(
    Tree = as.character(AllRules$RuleText[i])
    , Quality = AllRules$Quality[i])
}


# Reference value: what the fitted model itself predicts for TT.
predict(XX, as.matrix(TT[,var.names]))
#[1] 0.1386877

# Sum the leaf values of the rules that are TRUE for TT and convert the total
# to a probability with the logistic function.
# NOTE: sqrt(Quality^2) is abs(Quality); once the selected leaves have mixed
# signs, the sum of absolute values is no longer the log-odds total, and
# neither Prob1 nor Prob2 agrees with the predict() value above.
filter(ff, Tree) %>% 
  summarise(
    Q1 = sum(sqrt(Quality^2))
    # ,Q2 = sum(sqrt(Quality^2))
    , Prob1 = exp(Q1)/(1+exp(Q1))
    , Prob2 = 1-Prob1
    )
#        Q1    Prob1    Prob2
#1 1.967608 0.877354 0.122646

1 个答案:

答案 0 :(得分:1)

要生成预测,只需将该样本在每个 booster(即每棵树)中所落入叶子的值直接相加(保留正负号),再把总和转换为概率:

# Keep the leaves whose rule is TRUE for the observation and add their values
# with the sign intact; Q1 is the resulting log-odds total.
# NOTE(review): this assumes a logistic objective and a base_score whose logit
# is 0 (0.5); `param` is not visible here - confirm, and add the logit of
# base_score to Q1 otherwise.
filter(ff, Tree) %>%
  summarise(
    Q1 = sum(Quality),
    # plogis(Q1) equals exp(Q1) / (1 + exp(Q1)) but does not become NaN when
    # exp(Q1) overflows.
    Prob1 = plogis(Q1),
    Prob2 = 1 - Prob1
  )