GBM的自定义分类阈值

时间:2017-07-28 00:37:23

标签: r machine-learning classification r-caret gbm

我正在尝试创建一个自定义GBM模型,用于调整二进制分类问题的分类阈值。在插入符号网站here上提供了一个很好的示例,但是当我尝试应用类似于GBM的内容时,我收到以下错误:

Error in { : task 1 failed - "argument 1 is not a vector"

不幸的是,我不知道错误在哪里,错误也没有用。

以下是一个示例,其中包含我用于定义自定义GBM的代码

library(caret)
library(gbm)
library(pROC)
#### DEFINE A CUSTOM GBM MODEL FOR PROBABILITY THRESHOLD TUNING ####
## Get the model code for the original gbm method from caret
customGBM <- getModelInfo("gbm", regex = FALSE)[[1]]
customGBM$type <- c("Classification")
## Add the threshold (i.e. class cutoff) as another tuning parameter
customGBM$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage",
                                                 "n.minobsinnode", "threshold"),
                                   class = rep("numeric", 5),
                                   label = c("# Boosting Iterations", "Max Tree Depth", "Shrinkage",
                                             "Min. Terminal Node Size", "Probability Cutoff"))
## Customise the tuning grid:
## Some paramters are fixed. Will give a tuning grid of 2,500 values if len = 100
customGBM$grid <- function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    grid <- expand.grid(n.trees = seq(50, 250, 50),
                        interaction.depth = 2, ### fix interaction depth at 2
                        shrinkage = 0.0001, ### fix learning rate at 0.0001
                        n.minobsinnode = seq(2, 10, 2),
                        threshold = seq(.01, .99, length = len))
    } else {
    grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)),
                        interaction.depth = sample(1:10, replace = TRUE, size = len),
                        shrinkage = runif(len, min = .001, max = .6),
                        n.minobsinnode = sample(5:25, replace = TRUE, size = len),
                        threshold = runif(1, 0, size = len))
    grid <- grid[!duplicated(grid),] ### remove any duplicated rows in the training grid
  }
  grid
}

## Here we fit a single gbm model and loop over the threshold values to get predictions from the
## same gbm model.
customGBM$loop = function(grid) {
  library(plyr)
  loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"),
                function(x) c(threshold = max(x$threshold)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for (i in seq(along = loop$threshold)) {
    index <- which(grid$n.trees == loop$n.trees[i] &
                     grid$interaction.depth == loop$interaction.depth[i] &
                     grid$shrinkage == loop$shrinkage[i] &
                     grid$n.minobsinnode == loop$n.minobsinnode[i])
    cuts <- grid[index, "threshold"]
    submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
  }
  list(loop = loop, submodels = submodels)
}

## Fit the model independent of the threshold parameter
customGBM$fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)
  if (any(names(theDots) == "distribution")) {
    modDist <- theDots$distribution
    theDots$distribution <- NULL
  } else {
    if (is.numeric(y)) {
      stop("This works only for 2-class classification problems")
      } else modDist <- if (length(lev) == 2)  "bernoulli" else
        stop("This works only for 2-class classification problems")
  }
  # if (length(levels(y)) != 2)
  #   stop("This works only for 2-class problems")
  ## check to see if weights were passed in (and availible)
  if (!is.null(wts)) theDots$w <- wts
  if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0)
  modArgs <- list(x = x,
                  y = y,
                  interaction.depth = param$interaction.depth,
                  n.trees = param$n.trees,
                  shrinkage = param$shrinkage,
                  n.minobsinnode = param$n.minobsinnode,
                  distribution = modDist)
  do.call("gbm.fit", modArgs)
}


## Now get a probability prediction and use different thresholds to
## get the predicted class
customGBM$predict = function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, n.trees = modelFit$tuneValue$n.trees,
                 type = "response")#[, modelFit$obsLevels[1]]
  out[is.nan(out)] <- NA
  class1Prob <- ifelse(out >= modelFit$tuneValue$threshold,
                                modelFit$obsLevels[1],
                                modelFit$obsLevels[2])

  ## Raise the threshold for class #1 and a higher level of
  ## evidence is needed to call it class 1 so it should
  ## decrease sensitivity and increase specificity
  out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                modelFit$obsLevels[1],
                modelFit$obsLevels[2])
  if (!is.null(submodels)) {
    tmp2 <- out
    out <- vector(mode = "list", length = length(submodels$threshold))
    out[[1]] <- tmp2
    for (i in seq(along = submodels$threshold)) {
      out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                             modelFit$obsLevels[1],
                             modelFit$obsLevels[2])
    }
  }
  out
}

## The probabilities are always the same but we have to create
## mulitple versions of the probs to evaluate the data across
## thresholds
customGBM$prob = function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, type = "response",
                 n.trees = modelFit$tuneValue$n.trees)
  out[is.nan(out)] <- NA
  out <- cbind(out, 1 - out)
  colnames(out) <- modelFit$obsLevels
  if (!is.null(submodels)) {
    tmp <- predict(modelFit, newdata, type = "response", n.trees = submodels$n.trees)
    tmp <- as.list(as.data.frame(tmp))
    lapply(tmp, function(x, lvl) {
      x <- cbind(x, 1 - x)
      colnames(x) <- lvl
      x}, lvl = modelFit$obsLevels)
    out <- c(list(out), tmp)
  }
  out
}

fourStats <- function (data, lev = levels(data$obs), model = NULL) {
  ## This code will get use the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))

  ## The best possible model has sensitivity of 1 and specificity of 1. 
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]), 
                   ncol = 2, 
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}

然后是一些显示如何使用自定义模型的代码

set.seed(949)
trainingSet <- twoClassSim(500, -9)
mod1 <- train(Class ~ ., data = trainingSet,
              method = customGBM, metric = "Dist",
              maximize = FALSE, tuneLength = 10,
              trControl = trainControl(method = "cv", number = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))

模型似乎正在运行,但是从上面的错误结束。如果有人可以帮我定制GBM模型来调整GBM参数,以及那些很棒的类的概率阈值。

0 个答案:

没有答案