Question

互联网上的例子都非常复杂，我无法把它们应用到我的代码中。我有一个包含 14 个自变量和 1 个因变量的数据集，并用 R 进行分类。这是我的代码：

# Load the comma-separated "adult.data" file. Fields equal to " ?" (with the
# leading space) are read as NA.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0.0, where
# the default became FALSE. The levels(...) lookups and confusionMatrix()
# call further down need the outcome column to be a factor.
dataset <- read.table(
  "adult.data",
  sep = ",",
  na.strings = c(" ?"),
  stringsAsFactors = TRUE
)

# The file is read without a header row (columns arrive as V1..V15), so name
# the 15 columns explicitly. The last one, is.big.50k, is used as the outcome
# by the modelling code below.
colnames(dataset) <- c(
  "age",
  "workclass",
  "fnlwgt",
  "education",
  "education.num",
  "marital.status",
  "occupation",
  "relationship",
  "race",
  "sex",
  "capital.gain",
  "capital.loss",
  "hours.per.week",
  "native.country",
  "is.big.50k"
)

# Drop every row that contains at least one NA.
dataset <- na.omit(dataset)

library(caret)
# Fix the RNG seed so the split below is reproducible.
set.seed(1)

# Row indices for the training part (p = 0.7), chosen on the outcome column.
# list = FALSE makes caret return the indices as a matrix instead of a list.
train_rows <- createDataPartition(
  y = dataset[["is.big.50k"]],
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
training.set <- dataset[train_rows, ]
test.set <- dataset[-train_rows, ]

###################################################################
## Naive Bayes
library(e1071)
# Fit naive Bayes on the predictor columns. Column 15 (is.big.50k) is the
# outcome, so it is removed from x and passed separately as y.
classifier <- naiveBayes(x = training.set[, -15], y = training.set$is.big.50k)

# Predicted class for every test row (predictors only).
prediction <- predict(classifier, newdata = test.set[, -15])

# Confusion matrix with the second outcome level as the positive class.
cm <- confusionMatrix(
  data = prediction,
  reference = test.set[, 15],
  positive = levels(test.set$is.big.50k)[2]
)

# Accuracy: diagonal cells (agreeing predictions) over all cells.
cm_counts <- as.matrix(cm)
accuracy <- sum(diag(cm_counts)) / sum(cm_counts)

# NOTE: the two results below are stored under the same names as the caret
# functions that compute them. Calls still resolve to the functions, because
# R skips non-function objects when looking up a call.
sensitivity <- sensitivity(
  prediction,
  test.set[, 15],
  positive = levels(test.set$is.big.50k)[2]
)

specificity <- specificity(
  prediction,
  test.set[, 15],
  negative = levels(test.set$is.big.50k)[1]
)

我试过下面这段代码，它可以运行。这样做有什么错误吗？类型转换的过程（即对 as.numeric() 的使用）有没有问题？
library(ROCR)
pred <- prediction(as.numeric(prediction), as.numeric(test.set[,15]))
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, main = "NB的ROC曲线", col = "blue", lwd = 3)
abline(a = 0, b = 1, lwd = 2, lty = 2)

Answer 1

试试这个：

# Reproducible toy data: `amount` points with x and y each drawn uniformly
# from [-1, 1].
set.seed(1)
library(data.table)
amount <- 100
dataset <- data.table(
  x = runif(amount, -1, 1),
  y = runif(amount, -1, 1)
)

# target is TRUE for points strictly inside the circle of radius 0.5 around
# the origin, FALSE otherwise.
dataset <- dataset[, target := (sqrt(x^2 + y^2) < 0.5)]

# Scatter plot of the two classes: FALSE points in red, TRUE points in green.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables. Axis labels are set explicitly so they do not default to the
# deparsed subsetting expressions.
outside <- dataset[target == FALSE]
inside <- dataset[target == TRUE]
plot(
  outside$x, outside$y,
  col = "red",
  xlim = c(-1, 1), ylim = c(-1, 1),
  xlab = "x", ylab = "y"
)
points(inside$x, inside$y, col = "green")

library(caret)

# Row indices for the training part (p = 0.7), chosen on the target column.
# list = FALSE makes caret return the indices as a matrix instead of a list.
train_rows <- createDataPartition(
  y = dataset[["target"]],
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; all remaining rows form the test set.
training.set <- dataset[train_rows, ]
test.set <- dataset[-train_rows, ]

###################################################################
## Naive Bayes
library(e1071)
# Fit naive Bayes on the two coordinate columns with target as the outcome.
classifier <- naiveBayes(x = training.set[, .(x, y)], y = training.set$target)

# type = "raw" asks for per-class scores instead of a hard class label.
class_scores <- predict(classifier, newdata = test.set[, .(x, y)], type = "raw")

# Keep the second column only. Presumably this is the TRUE class, since FALSE
# sorts before TRUE; confirm with colnames(class_scores).
prediction <- class_scores[, 2]

# Store the score next to the true label so both can be filtered together.
test.set <- test.set[, prediction := prediction]

# Sweep a grid of decision thresholds and record, for each one, the true
# positive rate and the true negative rate on the test set.
thresholds <- seq(0, 1, by = 0.1)

# The class sizes do not depend on the threshold, so count them once.
n_positive <- test.set[target == TRUE, .N]
n_negative <- test.set[target == FALSE, .N]

# Share of actual TRUE examples whose score lies above the threshold.
# vapply builds the whole result vector at once instead of growing it with
# c() on every iteration, and checks each result is a single number.
TPrates <- vapply(
  thresholds,
  function(threshold) {
    test.set[target == TRUE & prediction > threshold, .N] / n_positive
  },
  numeric(1)
)

# Share of actual FALSE examples whose score is at or below the threshold.
TNrates <- vapply(
  thresholds,
  function(threshold) {
    test.set[target == FALSE & prediction <= threshold, .N] / n_negative
  },
  numeric(1)
)

# ROC curve: 1 - TNR (the false positive rate) on x against TPR on y.
plot(1 - TNrates, TPrates, type = "l")

说明：

只有当你拥有“数值概率”形式的预测（即 0 到 1 之间的数字）时，才能绘制 ROC 曲线——即使你想预测的东西只能是 TRUE 或 FALSE！因此，我们需要在预测那一行 prediction = predict(classifier, newdata = test.set[,.(x,y)], type="raw") 中加上 type="raw"。这样得到的预测就不再是 'TRUE' 或 'FALSE'，而是 0 到 1 之间的数字；而之前的 TRUE/FALSE 预测相当于 'numericPrediction >= 0.5'，即如果概率超过阈值，就预测为 'TRUE'，否则预测为 'FALSE'。

谁告诉我们 0.5 就是我们预测器的正确阈值？难道不能是 0.7 或 0.1 吗？没错！在没有更多关于问题的知识的情况下，我们事先并不知道哪个阈值是正确的。这就是为什么我们干脆“把它们全部试一遍”（我只试了 0、0.1、0.2、……、0.9、1），并用每个阈值各创建一个混淆矩阵。通过这种方式，我们可以看到预测器在不依赖具体阈值的情况下表现如何。曲线越是向完美分类器的方向“弯曲”（完美分类器的曲线是一个直角，即召回率为 100%、1 − 特异度为 0%），分类器的表现就越好。

解释轴！

Y轴意味着：预测器检测到了多少实际上正面的例子？

X轴意味着：预测器在做出预测时有多“浪费”？（即 1 − 特异度：实际为负的例子中有多少被错误地预测为正）

也就是说，你希望检测到的真实正例的比率尽可能高（例如，在预测疾病时，你必须确保每个真正患病的病人都能被检测出来，否则预测器就失去了存在的意义）。但是，简单地把每个人都预测为 'TRUE' 并没有帮助！治疗可能是有害的，也可能是昂贵的。因此，我们有两个相互对立的指标（召回率 = 检测到的真实正例的比率，1 − 特异度 = 预测器的“浪费率”），ROC 曲线上的每个点都对应一个可能的预测器。现在你需要在 ROC 曲线上选择你想要的点，查出得到这一点的阈值，并最终使用这个阈值。

Answer 2

要使ROC曲线起作用，您需要一些阈值或超参数。

贝叶斯分类器的数值输出往往不太可靠（而其二元决策通常没有问题），并且没有明显的超参数。您可以尝试把先验概率（仅限二分类问题）当作参数来处理，并据此绘制 ROC 曲线。

但无论如何，要得到一条曲线，您需要一个从某个曲线参数 t 到 (TPR, FPR) 的映射。例如，t 可以是您的先验概率。

如何在R中实现朴素贝叶斯分类算法的roc曲线分析？

2 个答案: