我刚开始使用R编程,我正在使用UCI的葡萄酒数据在R中编写CART算法。
正在使用递归来构建完整的CART树。问题是即使一切正常执行,存储树的矩阵(树)也会在CART函数中的以下行之后被完全删除:
tree <- CART(tree, data[indexL,], K, impurity, size_tree + 1, constants)
R是否有一种特定的方式来执行递归,或者在执行上一行之后只有一个缺失的步骤?
data <- read.csv(file = "wine.csv", header = FALSE)
names(data) <- list("U1", "U2", "U3", "U4", "U5", "U6", "U7", "U8", "U9", "U10", "U11", "U12", "U13", "U14")
data <- subset(data, select = c(U2:U14, U1))
constants <- list(1, 2, 3, 4, 5, 6)
names(constants) <- list("parent", "k", "v", "left", "right", "y")
yDistribution <- function(data, K){
PY <- matrix(0, 1, K)
for(i in c(1:K)){
PY[i] <- length(which(data[, ncol(data)] == i))/nrow(data)
}
return(PY)
}
impurityGini <- function(PY){
i <- 1 - sum(PY^2)
return(i)
}
CART <- function(tree, data, K, impurity, index, constants){
rows <- dim(data)[1]
columns <- dim(data)[2]
columns <- columns - 1
if(length(tree) == 1){
tree <- matrix(0, 1, 6)
}
PY <- yDistribution(data, K)
y <- which.max(PY)
tree[index, constants$y] <- y
i <- impurityGini(PY)
if(i == 0){
return()
}
uniques <- integer(columns)
for(j in c(1:columns)){
uniques[j] <- length(unique(data[, j]))
}
S <- matrix(0, sum(uniques) - columns, 2)
offs <- 1
for(k in c(1:columns)){
u <- sort(unique(data[, k]))
S[offs:(offs + uniques[k] - 2), 1] <- k
S[offs:(offs + uniques[k] - 2), 2] <- (u[1:length(u)-1]+u[2:length(u)])/2
offs <- offs + uniques[k] - 1
}
E <- matrix(0, length(S[, 1]), 1)
for(j in 1:length(S[, 1])){
k <- S[j, 1]
v <- S[j, 2]
indexL <- which(data[, k] < v)
indexR <- which(data[, k] >= v)
impurityL <- impurityGini(yDistribution(matrix(data[indexL, length(data)], length(indexL), 1), k))
impurityR <- impurityGini(yDistribution(matrix(data[indexR, length(data)], length(indexR), 1), k))
E[j] <- length(indexL)/rows * impurityL + length(indexR)/rows * impurityR
}
j <- which.min(E)
minE <- E[j]
if(minE >= i){
return()
}
k <- S[j, 1]
v <- S[j, 2]
indexL <- matrix(which(data[, k] < v), length(which(data[, k] < v)), 1)
indexR <- matrix(which(data[, k] >= v), length(which(data[, k] >= v)), 1)
tree[index, constants$k] <- k
tree[index, constants$v] <- v
size_tree <- dim(tree)[1]
tree[index, constants$left] <- size_tree + 1
tree[index, constants$right] <- size_tree + 2
tree <- rbind(tree, c(index, 0, 0, 0, 0, 0))
tree <- rbind(tree, c(index, 0, 0, 0, 0, 0))
print(tree)
tree <- CART(tree, data[indexL,], K, impurity, size_tree + 1, constants)
tree <- CART(tree, data[indexR,], K, impurity, size_tree + 2, constants)
return(tree)
}