我正在进行聚类分析,想统计某个变量在修剪后的树的各个叶子中出现的次数。下面是一个简化示例,其中修剪后的树只有三个分支。我想知道这三个分支/叶子中各有多少个 A 和多少个 B,应该如何得到这些数量?
# Reproducible example: cluster 20 labelled observations, cut the tree into
# `n_clusters` groups, and build a pruned tree with one leaf per group.
# NOTE: no `rm(list = ls())` here; wiping the caller's workspace is a side
# effect a script should not have.
n_per_label <- 10
n_features <- 100
n_clusters <- 3

# Row labels: 10 observations tagged "A" followed by 10 tagged "B".
mylabels <- rep(c("A", "B"), each = n_per_label)

# Draw one exponential value per matrix cell. Supplying fewer values than
# cells makes matrix() recycle them silently, which duplicates columns.
myclusterdata <- matrix(
  rexp(length(mylabels) * n_features, rate = 0.1),
  nrow = length(mylabels),
  ncol = n_features
)
rownames(myclusterdata) <- mylabels

# Average-linkage clustering; `memb` maps each observation (named by its
# label) to one of the `n_clusters` groups.
hc <- hclust(dist(myclusterdata), method = "average")
memb <- cutree(hc, k = n_clusters)

# One row of column means (the centroid) per group, built in a single pass
# instead of growing a matrix with rbind() inside a loop.
cent <- do.call(
  rbind,
  lapply(seq_len(n_clusters), function(k) {
    colMeans(myclusterdata[memb == k, , drop = FALSE])
  })
)

# Cluster the centroids, passing the group sizes via `members`.
hc1 <- hclust(dist(cent)^2, method = "centroid", members = table(memb))

# whole tree
plot(as.dendrogram(hc), horiz = TRUE)
# pruned tree (one branch per group)
plot(as.dendrogram(hc1), horiz = TRUE)
答案 0 :(得分:0)
好的,我明白了。每片叶子包含的元素都记录在 memb 中……因此只要把它们重新整理,并与标签结合起来统计,就能得到结果。以下是示例代码:
rm(list=ls(all=TRUE))
# Reproducible example: cluster 20 labelled observations, cut the tree into
# `n_clusters` groups, build a pruned tree with one leaf per group, and
# count how many "A" and "B" observations fall into each leaf.
n_per_label <- 10
n_features <- 100
n_clusters <- 3

# Row labels: 10 observations tagged "A" followed by 10 tagged "B".
mylabels <- rep(c("A", "B"), each = n_per_label)

# Draw one exponential value per matrix cell. Supplying fewer values than
# cells makes matrix() recycle them silently, which duplicates columns.
myclusterdata <- matrix(
  rexp(length(mylabels) * n_features, rate = 0.1),
  nrow = length(mylabels),
  ncol = n_features
)
rownames(myclusterdata) <- mylabels

# Average-linkage clustering; `memb` maps each observation (named by its
# label) to one of the `n_clusters` groups.
hc <- hclust(dist(myclusterdata), method = "average")
memb <- cutree(hc, k = n_clusters)

# One row of column means (the centroid) per group, built in a single pass
# instead of growing a matrix with rbind() inside a loop.
cent <- do.call(
  rbind,
  lapply(seq_len(n_clusters), function(k) {
    colMeans(myclusterdata[memb == k, , drop = FALSE])
  })
)

# Cluster the centroids, passing the group sizes via `members`.
hc1 <- hclust(dist(cent)^2, method = "centroid", members = table(memb))

# whole tree
plot(as.dendrogram(hc), horiz = TRUE)
# pruned tree (one branch per group)
plot(as.dendrogram(hc1), horiz = TRUE)

# identify the counts and percentages of A and B in each leaf
var_of_interest <- levels(factor(names(memb)))
leaf_number <- levels(factor(memb))

# Cross-tabulate leaf membership against label: rows are leaves, columns
# are labels. This replaces the hand-written double loop.
counter <- table(
  leaf = factor(memb, levels = leaf_number),
  label = factor(names(memb), levels = var_of_interest)
)

# Share of each label within every leaf (each row sums to 1); works for any
# number of labels, not only two.
prop.table(counter, margin = 1)