我试图弄清楚如何根据xgboost模型的各棵树手工计算概率,使其与predict函数给出的概率一致。
首先我构建模型
library(xgboost)
#install.packages("ModelMetrics")
library(ModelMetrics)
# The rule-building and scoring code further down calls dplyr verbs
# (filter, transmute_, summarise, %>%) and foreach (%do%), so attach them too.
library(dplyr)
library(foreach)
set.seed(100)
# - Extreme gbm
# Response as integer labels.
y <- as.integer(testDF$y)
# Predictors: every column except `y`. Selecting by name with setdiff() and
# drop = FALSE avoids two pitfalls of `-which(...)`: an empty match would drop
# every column, and a single remaining column would collapse to a vector.
x <- testDF[, setdiff(names(testDF), "y"), drop = FALSE]
# Keep the predictor names; the matrix built below does not carry them.
var.names <- names(x)
x <- as.matrix(x)
# Rebuild as a plain numeric matrix. matrix() on a bare vector has no dimnames,
# so the model is trained on unnamed columns and var.names is the only record
# of which column is which.
x <- matrix(as.numeric(x), nrow(x), ncol(x))
nround <- 10
# `param` (the booster parameter list) is not defined in this snippet and must
# already exist. The argument is spelled out as `params` instead of relying on
# partial matching of `param`.
XX <- xgboost(params = param, data = x, label = y, nrounds = nround, missing = NA)
然后我写了一些代码来构建导致特定叶子的所有规则
# Flatten every tree of the fitted model into a single table of nodes.
baseTree <- xgb.model.dt.tree(model = XX)
# Terminal nodes are the rows whose Feature is "Leaf"; all other rows are splits.
Leafs <- filter(baseTree, Feature == "Leaf")
Branches <- filter(baseTree, Feature != "Leaf")
# Translate split features back to column names.
# NOTE(review): assumes Feature holds 0-based column indices stored as text
# (hence as.numeric() + 1) -- confirm for the installed xgboost version.
Branches$Feature <- var.names[as.numeric(Branches$Feature) + 1]
# Scratch slot per leaf; holds the most recently built condition for that leaf.
FullRules <- rep(NA_character_, nrow(Leafs))
# For each leaf, walk up from the leaf to the root of its tree and collect the
# condition on every split along the way. The result is one row per leaf: the
# leaf's own columns plus RuleText, an R expression (as text) that is TRUE
# exactly for observations routed to that leaf.
# seq_len() keeps the loop empty when there are no leaves (1:0 would not).
AllRules <- foreach(i = seq_len(nrow(Leafs)), .combine = "rbind") %do% {
  theLeaf <- Leafs[i, ]
  theNode <- theLeaf$Node
  theID <- theLeaf$ID
  conditions <- character(0)
  # Node 0 is the root of a tree; stop once the current node is the root.
  while (theNode != 0) {
    # The parent is the split that points at the current node through its
    # Yes, No or Missing link.
    FF <- filter(Branches, Yes == theID | No == theID | Missing == theID)
    # A non-root node must have exactly one parent; fail loudly otherwise.
    stopifnot(nrow(FF) == 1L)
    isYes <- FF$Yes == theID
    isNo <- FF$No == theID
    isMissing <- FF$Missing == theID
    # Yes link = "feature < split", No link = "feature >= split". When the
    # Missing link shares the same child, NA values are routed there as well.
    if (isYes && isMissing) {
      FullRules[i] <- paste0(
        "(", FF$Feature, " < ", FF$Split, " | is.na(", FF$Feature, "))"
      )
    } else if (isNo && isMissing) {
      FullRules[i] <- paste0(
        "(", FF$Feature, " >= ", FF$Split, " | is.na(", FF$Feature, "))"
      )
    } else if (isYes) {
      FullRules[i] <- paste0(FF$Feature, " < ", FF$Split)
    } else if (isNo) {
      FullRules[i] <- paste0(FF$Feature, " >= ", FF$Split)
    } else {
      # Reached only through the Missing link.
      FullRules[i] <- paste0("is.na(", FF$Feature, ")")
    }
    conditions <- c(conditions, FullRules[i])
    # Move one level up the tree.
    theNode <- FF$Node
    theID <- FF$ID
  }
  # Conditions are joined leaf-to-root. A leaf that is itself the root has no
  # conditions and applies to every observation, so its rule is the literal
  # TRUE rather than an empty (unparseable) string.
  RuleText <- if (length(conditions) > 0) {
    paste(conditions, collapse = " & ")
  } else {
    "TRUE"
  }
  data.frame(
    Leafs[i, ],
    RuleText
  )
}
现在我选出1行并尝试匹配概率。在这种情况下,它匹配。循环会遍历所有规则,并对该客户满足的每条规则标记TRUE。然后我筛选出这些行,把它们的值相加得到对数几率(log-odds)估计值,再将其转换为概率。
# Score a single observation (row 25) by hand and compare with predict().
TT <- testDF[25,]
# For every leaf rule in AllRules, evaluate the rule text against this one row.
# transmute_() is given the rule as a string, so Tree is the result of that
# expression for the row (TRUE when the row satisfies the rule) and Quality is
# the Quality value of the same AllRules row. Results are stacked with rbind.
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% {
TT %>% transmute_(
Tree = as.character(AllRules$RuleText[i])
, Quality = AllRules$Quality[i])
}
# Reference probability from the fitted model for the same row.
predict(XX, as.matrix(TT[,var.names]))
#[1] 0.05571342
# Keep only the leaves whose rule is TRUE for this row and aggregate them.
filter(ff, Tree) %>%
summarise(
# NOTE(review): sqrt(Quality^2) equals abs(Quality), so the sign of every
# leaf value is discarded before summing.
Q1 = sum(sqrt(Quality^2))
# ,Q2 = sum(sqrt(Quality^2))
# Prob1 is the logistic transform of Q1; Prob2 is its complement.
, Prob1 = exp(Q1)/(1+exp(Q1))
, Prob2 = 1-Prob1
)
# Recorded output: Prob2 agrees with the predict() value above for this row.
# Q1 Prob1 Prob2
#1 2.830209 0.9442866 0.0557134
但在这种情况下,它与预测函数不匹配......
# Repeat the manual scoring for a different observation (row 17).
TT <- testDF[17,]
# For every leaf rule in AllRules, evaluate the rule text against this one row.
# transmute_() is given the rule as a string, so Tree is the result of that
# expression for the row (TRUE when the row satisfies the rule) and Quality is
# the Quality value of the same AllRules row. Results are stacked with rbind.
ff <- foreach(i = 1:nrow(AllRules), .combine = 'rbind') %do% {
TT %>% transmute_(
Tree = as.character(AllRules$RuleText[i])
, Quality = AllRules$Quality[i])
}
# Reference probability from the fitted model for the same row.
predict(XX, as.matrix(TT[,var.names]))
#[1] 0.1386877
# Keep only the leaves whose rule is TRUE for this row and aggregate them.
filter(ff, Tree) %>%
summarise(
# NOTE(review): sqrt(Quality^2) equals abs(Quality), so the sign of every
# leaf value is discarded before summing.
Q1 = sum(sqrt(Quality^2))
# ,Q2 = sum(sqrt(Quality^2))
# Prob1 is the logistic transform of Q1; Prob2 is its complement.
, Prob1 = exp(Q1)/(1+exp(Q1))
, Prob2 = 1-Prob1
)
# Recorded output: neither Prob1 nor Prob2 equals the predict() value above.
# Q1 Prob1 Prob2
#1 1.967608 0.877354 0.122646
答案 0 :(得分:1)
要生成预测,只需将每个样本在每棵提升树(booster)中落入的叶子的值直接相加(保留正负号,不要取绝对值),再把总和转换为概率
# Keep the leaves whose rule is TRUE for the observation and add up their
# signed Quality values to get the log-odds (Q1); no abs()/sqrt(x^2) here,
# because leaves with negative values must pull the score down.
# NOTE(review): assumes the model adds no extra offset to the summed leaves
# (e.g. base_score = 0.5 under a logistic objective) -- confirm against the
# parameters used to fit the model.
filter(ff, Tree) %>%
  summarise(
    Q1 = sum(Quality),
    # plogis(Q1) is the logistic function exp(Q1) / (1 + exp(Q1)), computed
    # without overflowing to NaN when Q1 is large.
    Prob1 = plogis(Q1),
    Prob2 = 1 - Prob1
  )