Question

我正在阅读Max Kuhn的“ Applied Predictive Modeling”中的第11章，我尝试对GermanCredit数据运行代码以制作ROC曲线。我已经将代码复制到了这篇文章的结尾。

在下面的这一行中：尽管没有使用该名称的列，但提到了列Class。

inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]]

我将第20列的名称从“ credit_risk”更改为“ Class”，然后上面的行运行正常。

尽管下面的行给出了错误：

creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"]

错误：

Error in `[.data.frame`(predict(logisticReg, GermanCreditTest, type = "prob"),  : 
  undefined columns selected
In addition: Warning message:
In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type ==  :
  prediction from a rank-deficient fit may be misleading

我不知道如何解决此错误。我已经安装了所有必需的软件包。难道我做错了什么，因为书中的代码给出了多个错误很奇怪。谢谢！

### Recreate the model used in the over-fitting chapter

library(caret)
data(GermanCredit)

## First, remove near-zero variance predictors then get rid of a few predictors 
## that duplicate values. For example, there are two possible values for the 
## housing variable: "Rent", "Own" and "ForFree". So that we don't have linear
## dependencies, we get rid of one of the levels (e.g. "ForFree")

GermanCredit <- GermanCredit[, -nearZeroVar(GermanCredit)]
GermanCredit$CheckingAccountStatus.lt.0 <- NULL
GermanCredit$SavingsAccountBonds.lt.100 <- NULL
GermanCredit$EmploymentDuration.lt.1 <- NULL
GermanCredit$EmploymentDuration.Unemployed <- NULL
GermanCredit$Personal.Male.Married.Widowed <- NULL
GermanCredit$Property.Unknown <- NULL
GermanCredit$Housing.ForFree <- NULL

names(GermanCredit)[20] <- "Class"

## Split the data into training (80%) and test sets (20%)
set.seed(100)
inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]]
GermanCreditTrain <- GermanCredit[ inTrain, ]
GermanCreditTest  <- GermanCredit[-inTrain, ]

set.seed(1056)
logisticReg <- train(Class ~ .,
                     data = GermanCreditTrain,
                     method = "glm",
                     trControl = trainControl(method = "repeatedcv", 
                                              repeats = 5))
logisticReg

### Predict the test set
creditResults <- data.frame(obs = GermanCreditTest$Class)
creditResults$prob <- predict(logisticReg, GermanCreditTest, type = "prob")[, "Bad"]
creditResults$pred <- predict(logisticReg, GermanCreditTest)
creditResults$Label <- ifelse(creditResults$obs == "Bad", 
                              "True Outcome: Bad Credit", 
                              "True Outcome: Good Credit")

### Plot the probability of bad credit
histogram(~prob|Label,
          data = creditResults,
          layout = c(2, 1),
          nint = 20,
          xlab = "Probability of Bad Credit",
          type = "count")

### Calculate and plot the calibration curve
creditCalib <- calibration(obs ~ prob, data = creditResults)
xyplot(creditCalib)

### Create the confusion matrix from the test set.
confusionMatrix(data = creditResults$pred, 
                reference = creditResults$obs)

### ROC curves:

### Like glm(), roc() treats the last level of the factor as the event
### of interest so we use relevel() to change the observed class data

library(pROC)
creditROC <- roc(relevel(creditResults$obs, "Good"), creditResults$prob)

coords(creditROC, "all")[,1:3]

auc(creditROC)
ci.auc(creditROC)

### Note the x-axis is reversed
plot(creditROC)

### Old-school:
plot(creditROC, legacy.axes = TRUE)

### Lift charts

creditLift <- lift(obs ~ prob, data = creditResults)
xyplot(creditLift)


################################################################################

预测函数生成错误“未定义的列已选择”

0 个答案: