我知道randomForest应该是一个黑盒子,并且大多数人都对整个分类器的ROC曲线感兴趣,但我正在研究一个需要检查各个RF树的问题。我对R不是很熟悉,那么为RF生成的单个树绘制ROC曲线的简单方法是什么?
答案 0（得分：1）
我不认为您可以从randomForest包生成的随机森林中的单个树生成ROC曲线。不过，您可以访问每棵树的预测输出，例如对训练集进行预测。
# caret provides the GermanCredit example data set
library(caret)
library(randomForest)
data(GermanCredit)
# Use only 50 rows for demonstration
nrows <- 50
# Predictors: the first 9 columns; target: column 10 ("Class")
x <- GermanCredit[1:nrows, 1:9]
y <- GermanCredit$Class[1:nrows]
# Build the model
rf_model <- randomForest(x = x, y = y, ntree = 11)
# Predict on the training data. predict.all = TRUE keeps the per-tree
# outputs, accessible afterwards via rf_pred$individual
rf_pred <- predict(rf_model, newdata = x, predict.all = TRUE, type = "prob")
您可以使用 rf_pred$individual 访问每棵树的预测。
然而,单个树的预测只是最可能的标签。对于ROC曲线,您需要类概率,因此更改决策阈值会更改预测类别以改变真实和误报率。
据我所知,至少在包randomForest中,没有办法使叶子输出概率而不是标签。如果使用getTree()检查树,您将看到预测是二进制的;使用getTree(rf_model,k = 1,labelVar = TRUE),你将以纯文本形式看到标签。
但是，您可以通过predict.all = TRUE检索单个树的预测，然后手动在整个森林的某个子集上计算类标签。之后，您可以将结果输入到计算ROC曲线的函数中，例如ROCR包中的函数。
编辑:好的,通过您在评论中提供的链接,我了解了如何获得ROC曲线。首先,我们需要提取一个特定的树,然后将每个数据点输入到树中,以便计算每个节点上成功类的出现以及每个节点中的总数据点。该比率给出了成功类的节点概率。接下来,我们做类似的事情,即将每个数据点输入到树中,但现在记录概率。这样我们就可以将类probs与真正的标签进行比较。 这是代码:
# Dependencies: randomForest for the model, ROCR for the ROC curve
library(randomForest)
library(ROCR)
# Pin the RNG seed so the forest — and thus the extracted tree — is
# reproducible across runs
set.seed(54321)
# Route a single data point down a tree (as returned by
# randomForest::getTree(..., labelVar = TRUE)) and return the index of the
# terminal node it lands in.
#
# tree:     data.frame with columns "left daughter", "right daughter",
#           "split var", "split point" and "status" (status == -1 marks a
#           terminal node).
# data_row: one-row data.frame holding the predictor values.
#
# randomForest's convention for numeric predictors is that values
# <= split point go to the LEFT daughter and values > split point go to the
# RIGHT daughter; the original code had this comparison inverted.
# NOTE(review): this traversal handles numeric split variables only — for
# categorical predictors getTree() encodes the split point as a category
# set, which is not decoded here.
travelTree <- function(tree, data_row) {
  node <- 1
  while (tree[node, "status"] != -1) {
    split_value <- data_row[, tree[node, "split var"]]
    if (split_value <= tree[node, "split point"]) {
      node <- tree[node, "left daughter"]
    } else {
      node <- tree[node, "right daughter"]
    }
  }
  node
}
# Number of training rows and number of trees for the demonstration
nrows <- 100
ntree <- 11
# Load the example data set (provided by caret)
data(GermanCredit)
# Predictors: first 9 columns; target: column "Class" ("Good"/"Bad")
x <- GermanCredit[1:nrows, 1:9]
y <- GermanCredit$Class[1:nrows]
# Build the RF model
rf_model <- randomForest(x = x, y = y, ntree = ntree, nodesize = 10)
# Extract a single tree and add the columns we need to accumulate class
# probabilities per node
single_tree <- getTree(rf_model, k = 2, labelVar = TRUE)
single_tree$"split var" <- as.character(single_tree$"split var")
single_tree$sum_good <- 0
single_tree$sum <- 0
single_tree$pred_prob <- 0
# First pass: drop every training row down the tree and count, for each
# terminal node, how many rows land there and how many of them are "Good"
for (row_idx in seq_len(nrow(x))) {
  out_node <- travelTree(single_tree, x[row_idx, ])
  single_tree$sum_good[out_node] <-
    single_tree$sum_good[out_node] + (y[row_idx] == "Good")
  single_tree$sum[out_node] <- single_tree$sum[out_node] + 1
}
# Node probability of "Good" = share of "Good" rows among the rows that
# reached the node. Guard empty nodes so we do not divide by zero.
idcs <- single_tree$sum != 0
single_tree$pred_prob[idcs] <- single_tree$sum_good[idcs] / single_tree$sum[idcs]
# Second pass: route the rows through the tree again, this time reading out
# the node probabilities computed above
single_tree_pred <- rep(0, nrow(x))
for (row_idx in seq_len(nrow(x))) {
  out_node <- travelTree(single_tree, x[row_idx, ])
  single_tree_pred[row_idx] <- single_tree$pred_prob[out_node]
}
# Et voila: the ROC curve for the single tree!
plot(performance(prediction(single_tree_pred, y), "tpr", "fpr"))