Question

我是编码的新手，所以请在这里忍受我。我必须运行一个适合的Roc曲线，但下面的代码并没有为我画线。我试图用变量年龄和性别预测etype = 2即死亡。癌症是数据集的名称。

谁能告诉我这里做错了什么？

非常感谢！

 cancer <- read.csv("C:/Users/Jennifer/Desktop/SurvivalRatesforColonCancer.csv")
print(cancer)

#run descritpive stats
describe(cancer)
summary(cancer)
hist(cancer$age)
skewness(cancer$age)
kurtosis(cancer$age)

#Create a training and testing dataset
bound <- floor((nrow(cancer)/2))
print(bound)
cancer <- cancer[sample(nrow(cancer)),]
cancer.train <- cancer[1:bound, ]
cancer.test <- cancer[(bound+1):nrow(cancer), ]

print(cancer.train)

#create decision tree using rpart
fit <- rpart(etype ~ age + sex, method="class", data=cancer.train)
printcp(fit)
plotcp(fit)
summary(fit)

#Display decision tree
plot(fit, uniform = TRUE)
text(fit, use.n=TRUE, all=TRUE, cex=0.6)

#predict using the test dataset
pred1 <- predict(fit, cancer.test, type="class")

#Place the prediction variable back in the dataset
cancer.test$pred1 <- pred1

#show re-substitution error
table(cancer.train$etype, predict(fit, type="class"))

#Display accuracy rate
sum(cancer.test$etype==pred1)/length(pred1)

#Display Confusion Matrix
table(cancer.test$etype,cancer.test$pred1)

#prune the tree so it isn't overfitted.  Prune so that it will automatically minimize the cross-
#validated error 
pfit<- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
#Display decision tree
plot(pfit, uniform = TRUE)
text(pfit, use.n=TRUE, all=TRUE, cex=0.6)

#Calculate the accuracy rate of the new pruned tree
pred2 <- predict(pfit, cancer.test, type="class")
sum(cancer.test$etype==pred2)/length(pred2)



##############################################
#               ROC Curve                    #
##############################################

# for ROC curve we need probabilties so we can sort cancer.test
cancer.test$etype.probs <- predict(fit,cancer.test, type="prob")[,1] # returns prob of both cats, just need 1

roc.data <- data.frame(cutoffs = c(1,sort(unique(cancer.test$etype.probs),decreasing=T)),
                       TP.at.cutoff = 0,
                       TN.at.cutoff = 0)

for(i in 1:dim(roc.data)[1]){
  this.cutoff <- roc.data[i,"cutoffs"]
  roc.data$TP.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs >= this.cutoff,"etype"] == 1)
  roc.data$TN.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs < this.cutoff,"etype"] == 0)
}
roc.data$TPR <- roc.data$TP.at.cutoff/max(roc.data$TP.at.cutoff) 
roc.data$FPR <- roc.data$TN.at.cutoff/max(roc.data$TN.at.cutoff) 
roc.data$one.minus.FPR <- 1 - roc.data$FPR

with(roc.data,
     plot(x=one.minus.FPR,
          y=TPR,
          type = "l",
          xlim=c(0,1),
          ylim=c(0,1),
          main="ROC Curve for 'Fit'")     
)
abline(c(0,1),lty=2)

Roc Curve for Fit不在R

0 个答案: