我是编程新手,请多包涵。我需要为拟合的模型绘制 ROC 曲线,但下面的代码没有画出曲线。我想用变量 age(年龄)和 sex(性别)来预测 etype = 2(即死亡)。数据集的名称是 cancer。
谁能告诉我这里做错了什么?
非常感谢!
# Load the colon-cancer survival data from a local CSV file and echo it.
cancer <- read.csv(
  file = "C:/Users/Jennifer/Desktop/SurvivalRatesforColonCancer.csv"
)
print(cancer)

# Descriptive statistics for the whole data frame.
# NOTE(review): describe(), skewness() and kurtosis() are not base R; they
# presumably come from packages attached elsewhere (e.g. psych/Hmisc and
# moments/e1071) -- confirm which ones this script relies on.
describe(cancer)
summary(cancer)

# Distribution of the age column: histogram plus shape statistics.
hist(cancer$age)
skewness(cancer$age)
kurtosis(cancer$age)
# Create training and testing datasets: shuffle the rows, then split them
# in half (the training half gets floor(n / 2) rows, the test half the rest).
# A fixed seed makes the shuffle, and every result computed from the split,
# reproducible from run to run.
set.seed(42)
bound <- floor(nrow(cancer) / 2)
print(bound)
cancer <- cancer[sample(nrow(cancer)), , drop = FALSE]
# seq_len() keeps both halves correct for tiny inputs, where 1:bound or
# (bound + 1):nrow(cancer) would count backwards and select the wrong rows.
cancer.train <- cancer[seq_len(bound), , drop = FALSE]
cancer.test <- cancer[bound + seq_len(nrow(cancer) - bound), , drop = FALSE]
print(cancer.train)
# Grow a classification tree for etype from age and sex on the training half.
fit <- rpart(
  formula = etype ~ age + sex,
  data = cancer.train,
  method = "class"
)

# Inspect the fit: complexity-parameter table, its plot, and the full summary.
printcp(fit)
plotcp(fit)
summary(fit)

# Draw the tree and annotate all nodes, including observation counts.
plot(fit, uniform = TRUE)
text(fit, use.n = TRUE, all = TRUE, cex = 0.6)
# Classify the held-out rows with the unpruned tree and store the predicted
# labels next to the data they belong to.
pred1 <- predict(fit, newdata = cancer.test, type = "class")
cancer.test$pred1 <- pred1

# Re-substitution table: training labels vs. the tree's own fitted classes.
table(cancer.train$etype, predict(fit, type = "class"))

# Test-set accuracy: share of held-out rows whose prediction matches etype.
sum(pred1 == cancer.test$etype) / length(pred1)

# Test-set confusion matrix (rows: observed etype, columns: predicted class).
table(cancer.test$etype, cancer.test$pred1)
# Prune the tree at the complexity parameter whose cross-validated error
# (the "xerror" column of the cp table) is smallest, to limit overfitting.
pfit <- local({
  cp.table <- fit$cptable
  prune(fit, cp = cp.table[which.min(cp.table[, "xerror"]), "CP"])
})

# Draw the pruned tree with the same layout and labels as the full tree.
plot(pfit, uniform = TRUE)
text(pfit, use.n = TRUE, all = TRUE, cex = 0.6)

# Test-set accuracy of the pruned tree.
pred2 <- predict(pfit, newdata = cancer.test, type = "class")
sum(pred2 == cancer.test$etype) / length(pred2)
##############################################
# ROC Curve #
##############################################
# Build an ROC curve for `fit` on the held-out rows, treating etype == 2 as
# the positive class.
# NOTE(review): assumes etype is coded 1/2 with 2 = death -- confirm against
# the data dictionary.
positive.class <- "2"

# predict(type = "prob") returns one probability column per class; select the
# positive class by name so the scores and the labels below always agree.
class.probs <- predict(fit, cancer.test, type = "prob")
cancer.test$etype.probs <- class.probs[, positive.class]

# Both classes must be present, otherwise a rate below would be 0 / 0 = NaN
# and plot() would silently draw nothing.
is.positive <- as.character(cancer.test$etype) == positive.class
n.positive <- sum(is.positive)
n.negative <- sum(!is.positive)
if (n.positive == 0 || n.negative == 0) {
  stop("ROC curve needs both outcome classes in cancer.test", call. = FALSE)
}

# One row per distinct score, from the strictest to the loosest threshold.
# The leading Inf classifies nothing as positive, so the curve starts at (0, 0).
roc.data <- data.frame(
  cutoffs = c(Inf, sort(unique(cancer.test$etype.probs), decreasing = TRUE)),
  TP.at.cutoff = 0,
  TN.at.cutoff = 0
)

# A row is predicted positive when its score is at or above the cutoff.
roc.data$TP.at.cutoff <- vapply(
  roc.data$cutoffs,
  function(cutoff) sum(is.positive & cancer.test$etype.probs >= cutoff),
  numeric(1)
)
roc.data$TN.at.cutoff <- vapply(
  roc.data$cutoffs,
  function(cutoff) sum(!is.positive & cancer.test$etype.probs < cutoff),
  numeric(1)
)

roc.data$TPR <- roc.data$TP.at.cutoff / n.positive
# The column name FPR is kept for compatibility, but it holds the
# true-negative rate (specificity); one.minus.FPR is the false-positive rate
# that goes on the x axis.
roc.data$FPR <- roc.data$TN.at.cutoff / n.negative
roc.data$one.minus.FPR <- 1 - roc.data$FPR

with(
  roc.data,
  plot(
    x = one.minus.FPR,
    y = TPR,
    type = "l",
    xlim = c(0, 1),
    ylim = c(0, 1),
    xlab = "False positive rate (1 - specificity)",
    ylab = "True positive rate (sensitivity)",
    main = "ROC Curve for 'Fit'"
  )
)
# Reference diagonal: the expected curve of a classifier that guesses at random.
abline(a = 0, b = 1, lty = 2)