对相关变量求和

时间:2016-10-04 17:00:08

标签: r dataframe correlation

我有一个包含 200 个变量的列表,我想把其中高度相关的变量加总(求和)。

假设这是我的数据

# Example data: 16 observations of five integer-valued variables.
# APPLE/PEAR are nearly identical, as are BANANA/LEMON.
mydata <- data.frame(
  APPLE = c(
    1L, 2L, 5L, 4L, 366L, 65L, 43L, 456L,
    876L, 78L, 687L, 378L, 378L, 34L, 53L, 43L
  ),
  PEAR = c(
    2L, 2L, 5L, 4L, 366L, 65L, 43L, 456L,
    876L, 78L, 687L, 378L, 378L, 34L, 53L, 41L
  ),
  PLUM = c(
    10L, 20L, 10L, 20L, 10L, 20L, 1L, 0L,
    1L, 2010L, 20L, 10L, 10L, 10L, 10L, 10L
  ),
  BANANA = c(
    2L, 10L, 31L, 2L, 2L, 5L, 2L, 5L,
    1L, 52L, 1L, 2L, 52L, 6L, 2L, 1L
  ),
  LEMON = c(
    4L, 10L, 31L, 2L, 2L, 5L, 2L, 5L,
    1L, 52L, 1L, 2L, 52L, 6L, 2L, 3L
  )
)

我找到了下面这段代码,但不知道该如何调整才能满足我的需求:https://stackoverflow.com/a/39484353/4797853

library(igraph)

# Pearson correlation between every pair of columns of `mydata`.
var.corelation <- cor(as.matrix(mydata), method = "pearson")

# Zero out the diagonal and upper triangle so that each pair of variables
# is considered only once (prevents duplicated pairs and self-matches).
var.corelation <- var.corelation * lower.tri(var.corelation)

# (row, col) index pairs whose correlation exceeds the 0.62 threshold.
check.corelation <- which(var.corelation > 0.62, arr.ind = TRUE)

# Treat each correlated pair as an undirected edge; the connected components
# of the resulting graph are the groups of mutually linked variables.
# graph_from_data_frame()/components() replace the deprecated
# graph.data.frame()/clusters().
graph.cor <- graph_from_data_frame(check.corelation, directed = FALSE)

# Vertex names are the column indices (as strings). Reading them back from
# the graph keeps indices and memberships aligned without relying on the
# vertex order happening to match unique(as.vector(check.corelation)).
groups.cor <- split(
  as.integer(V(graph.cor)$name),
  components(graph.cor)$membership
)

# Show the variable names that belong to each group.
lapply(groups.cor, function(list.cor) rownames(var.corelation)[list.cor])

我期望的输出是如下两个数据框:

DF1

 GROUP1        GROUP2    
    3             16
    4             40
ETC..

其中每个值是该组内各变量在对应行上的取值之和。

DF2

ORIGINAL_VAR  GROUP

APPLE         1
PEAR          1
PLUM          2
BANANA        2
LEMON         2

1 个答案:

答案 0 :(得分:0)

试试这个(假设你只聚为2组):

# Row-wise sum of the columns belonging to each correlation group, giving one
# output column (GROUP1, GROUP2, ...) per element of `groups.cor`.
# `drop = FALSE` keeps a single-variable group as a one-column data frame so
# that rowSums() still works on it.
DF1 <- as.data.frame(
  lapply(groups.cor, function(idx) rowSums(mydata[, idx, drop = FALSE]))
)
names(DF1) <- paste0("GROUP", seq_along(groups.cor))
DF1

   GROUP1 GROUP2
1       3     16
2       4     40
3      10     72
4       8     24
5     732     14
6     130     30
7      86      5
8     912     10
9    1752      3
10    156   2114
11   1374     22
12    756     14
13    756    114
14     68     22
15    106     14
16     84     14

# Lookup table mapping every grouped variable to its group number.
# Built in one vectorised step for however many groups `groups.cor` holds,
# instead of growing the data frame with rbind() inside a loop.
DF2 <- data.frame(
  # Variable names, in the order the indices appear within each group.
  ORIGINAL_VAR = rownames(var.corelation)[unlist(groups.cor, use.names = FALSE)],
  # Group number repeated once per member of that group.
  GROUP = rep(seq_along(groups.cor), lengths(groups.cor))
)

DF2

  ORIGINAL_VAR GROUP
1         PEAR     1
2        APPLE     1
3       BANANA     2
4        LEMON     2
5         PLUM     2