我目前正在研究 randomForest 模型。在我的实验中,我通过交叉验证发现,nodesize 取值较大的配置表现优于其他配置。
但后来我发现了一些奇怪的事情。
下面是一段可复现的代码:
当 nodesize = nrow(Data) + 4 时:
# Reproducible example: fit a random forest on n = 10 simulated points with
# nodesize = n + 4 and print the predictions for the training rows.
library(randomForest)
library(data.table)

set.seed(1)
n <- 10
sigma <- 0.4

# Two uniform predictors and a noisy nonlinear response.
X <- runif(n)
Y <- runif(n)
Z <- X^2 + X * Y + Y + sigma * rnorm(n)
Data <- data.table(X, Y, Z)

# `ntree` is the argument that sets the number of trees. The original call
# spelled it `tree`, which is not a randomForest argument (presumably it was
# swallowed by `...`, leaving the default number of trees in effect).
model <- randomForest(
  formula = Z ~ X + Y,
  data = Data,
  mtry = 1,
  ntree = 500,
  nodesize = n + 4,
  do.trace = TRUE
)

# Predictions of the fitted forest for the same rows it was trained on.
pred <- predict(model, Data)
print(pred)
1.041549 1.036075 1.266310 1.324197 1.308377 1.480041 1.691081 1.752463 1.203856 1.306943
当 nodesize = nrow(Data) + 5 时:
# Reproducible example: identical data and settings to the nodesize = n + 4
# case, except that nodesize is raised to n + 5.
library(randomForest)
library(data.table)

set.seed(1)
n <- 10
sigma <- 0.4

# Two uniform predictors and a noisy nonlinear response.
X <- runif(n)
Y <- runif(n)
Z <- X^2 + X * Y + Y + sigma * rnorm(n)
Data <- data.table(X, Y, Z)

# `ntree` is the argument that sets the number of trees. The original call
# spelled it `tree`, which is not a randomForest argument (presumably it was
# swallowed by `...`, leaving the default number of trees in effect).
model <- randomForest(
  formula = Z ~ X + Y,
  data = Data,
  mtry = 1,
  ntree = 500,
  nodesize = n + 5,
  do.trace = TRUE
)

# Predictions of the fitted forest for the same rows it was trained on.
pred <- predict(model, Data)
print(pred)
1.330427 1.330427 1.330427 1.330427 1.330427 1.330427 1.330427 1.330427 1.330427 1.330427
从 nodesize = n + 5 开始,所有观测得到的预测值都相同。
对任意的 n 都是如此(例如 n = 20000)。
在我的理解中,nodesize 是一个节点要执行拆分所需的最小记录数。这似乎意味着:如果我们有 n 条记录,那么每棵树是通过有放回地抽取 n + 4 个观测来生长的。因此,当 nodesize > n + 4 时不会执行任何拆分,树直接返回全局均值——这就是为什么每个观测都得到相同的预测。这样理解有道理吗?是否有参数可以指定从原始数据集中抽取多少个样本?
提前致谢
答案 0 :(得分:0)
你对 nodesize 作用的理解是正确的:它是任何终端节点的最小大小。但是,randomForest 的行为似乎出乎意料(是 bug 吗?)。当 n <= nodesize < n+5 时,它实际上仍然会创建分支,这正是你所发现的现象。
如果 nodesize 为 10(即样本量),就不应该有任何拆分,但 randomForest 仍会进行拆分,把少数几个观测分离出去。当 nodesize 取 11:14 中的任一值时(此处未展示),它也会这样做:
# Demonstration with nodesize equal to the sample size (n = 10): fit the
# forest, predict over a grid spanning the data, and plot the predictions.
library(randomForest)
library(ggplot2)

n <- 10
sigma <- 0.4

# Each variable is drawn under its own seed.
set.seed(100)
X <- runif(n)
set.seed(200)
Y <- runif(n)
set.seed(1)
Z <- X^2 + X * Y + Y + sigma * rnorm(n)
Data <- data.frame(X, Y, Z)

#
# mtry = p, replace = FALSE and sampsize = n to eliminate randomness
#
model <- randomForest(
  formula = Z ~ X + Y, data = Data,
  mtry = 2, ntree = 50, nodesize = 10, replace = FALSE, sampsize = n
)

# 100 x 100 grid covering the observed range of both predictors.
grid <- expand.grid(
  X = seq(from = min(Data$X), to = max(Data$X), length.out = 100),
  Y = seq(from = min(Data$Y), to = max(Data$Y), length.out = 100)
)
grid$grid_preds <- predict(model, grid)

# Grid points are coloured by prediction; the training points are overlaid in
# blue. `size = 4` is a constant, so it is set outside aes() rather than being
# mapped through a size scale.
ggplot(grid, aes(x = X, y = Y)) +
  geom_point(aes(color = grid_preds)) +
  geom_point(data = Data, aes(x = X, y = Y), size = 4, color = "blue") +
  theme(legend.position = "none")
如果将 nodesize 设置为 n + 5 或更高,randomForest 就会如预期那样不再进行任何拆分:
# Refit with nodesize = 15 and plot the grid predictions again.
# Relies on `Data` and `n` created by the preceding snippet, and on
# randomForest and ggplot2 being attached.
model <- randomForest(
  formula = Z ~ X + Y, data = Data,
  mtry = 2, ntree = 50, nodesize = 15, replace = FALSE, sampsize = n
)

# 100 x 100 grid covering the observed range of both predictors.
grid <- expand.grid(
  X = seq(from = min(Data$X), to = max(Data$X), length.out = 100),
  Y = seq(from = min(Data$Y), to = max(Data$Y), length.out = 100)
)
grid$grid_preds <- predict(model, grid)

ggplot(grid, aes(x = X, y = Y, color = grid_preds)) +
  geom_point()
作为比较,ranger 表现出预期的行为:如果 min.node.size >= n,则不会尝试任何拆分:
# Comparison fit with ranger using min.node.size = 10.
# Relies on `Data` and `grid` created by the earlier snippets and on ggplot2
# being attached.
library(ranger)

# Sampling without replacement at fraction 1 and mtry = 2 mirror the
# randomForest settings used above; TRUE/FALSE are spelled out because T/F
# are ordinary, reassignable variables.
rang <- ranger(
  Z ~ X + Y,
  data = Data, write.forest = TRUE,
  replace = FALSE, sample.fraction = 1,
  mtry = 2, num.trees = 50, min.node.size = 10
)

# The predicted values are taken from the `prediction` element of the result.
grid$grid_preds <- predict(rang, grid)$prediction

ggplot(grid, aes(x = X, y = Y, color = grid_preds)) +
  geom_point()
顺便说一句,randomForest 的这个 n+5 规则也适用于 10 以外的其他 n。我也想知道这背后到底发生了什么。