如何根据两个变量及其共现频率对数据框进行聚类

时间:2018-05-24 12:31:13

标签: r cluster-analysis distance k-means hclust

假设我有一个包含样本的数据框:

structure(list(V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 
    6L, 6L, 6L, 7L, 7L, 7L, 9L, 9L, 9L, 13L, 13L, 13L, 15L, 15L, 
    18L, 22L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 
    26L, 27L, 27L, 28L, 32L, 32L, 32L, 32L, 32L, 36L, 36L, 36L, 36L, 
    36L, 36L, 36L, 37L, 37L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L, 
    38L, 39L, 39L, 39L, 39L, 40L, 40L, 40L, 41L, 41L, 42L, 45L, 45L, 
    45L, 45L, 47L, 47L, 47L, 48L, 50L, 50L, 51L, 53L, 53L, 54L), 
        V2 = c(2L, 7L, 20L, 3L, 5L, 6L, 7L, 13L, 15L, 18L, 19L, 20L, 
        4L, 5L, 6L, 7L, 9L, 12L, 6L, 9L, 12L, 13L, 15L, 18L, 7L, 
        9L, 13L, 15L, 18L, 9L, 20L, 44L, 12L, 27L, 44L, 15L, 18L, 
        58L, 16L, 18L, 19L, 23L, 27L, 28L, 29L, 32L, 45L, 47L, 50L, 
        51L, 52L, 53L, 54L, 55L, 28L, 29L, 29L, 45L, 47L, 53L, 54L, 
        55L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 38L, 39L, 40L, 41L, 
        42L, 43L, 39L, 40L, 41L, 42L, 43L, 40L, 41L, 42L, 43L, 41L, 
        42L, 43L, 42L, 43L, 43L, 47L, 53L, 54L, 55L, 53L, 54L, 55L, 
        49L, 51L, 52L, 52L, 54L, 55L, 55L), N = c(1L, 1L, 1L, 1L, 
        1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
        1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA, 
    -104L), class = c("data.table", "data.frame"))

我应该如何组织这些数据,才能用 cor()、dist() 和 hclust() 对它们进行聚类?

补充说明:V1 和 V2 是材料编号,N 是它们以相同顺序一起出现的频率。如果这样更容易处理,我也可以把 N 换算成 0 到 1 之间的数值来表示相关性。

据我所知,我必须先把它转换成如下所示的矩阵,其中 V1 作为行,V2 作为列,N 作为值。但我不知道该怎么做。

  1 2 3 4 ...
1 0 1 1 4
2 1 0 2 2
3 1 4 0 1
4 1 0 3 0
...

1 个答案:

答案 0 :(得分:0)

  

“据我所知,我必须首先将其更改为矩阵,看起来像这样,其中V1可以是行,V2可以是列,N是值。”

聚类分析需要的是距离矩阵,而你定义的矩阵并不是距离矩阵(它记录的是共现频率,数值越大表示越相似,而不是越远)。

我猜你希望经常一起出现的材料彼此更近,而从不一起出现的材料彼此更远。因此我建议这样定义材料之间的距离:两者一起出现过时取 1/N,从未一起出现时取 2。这样就会得到如下结果:

> ## df is your data.frame
> dd <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> dd <- 1/dd
> dd[is.na(dd)] <- 2
> diag(dd) <- 0
> dd <- as.dist(dd)
> dd
     1   2   3   4   5   6   7   9  ...
2  1.0                                                                                                                                    
3  2.0 1.0                                                                                                                                
4  2.0 2.0 0.5                                                                                                                            
5  2.0 1.0 1.0 2.0                                                                                                                        
6  2.0 1.0 1.0 2.0 0.5                                                                                                                    
7  1.0 0.5 1.0 2.0 2.0 1.0                                                                                                                
9  2.0 2.0 0.5 2.0 1.0 1.0 0.5                                                                                                            
...
> hc <- hclust(dd)
> plot(hc)

[图片:plot(hc) 生成的层次聚类树状图]

使用相似度矩阵的示例:

> ss <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> ss <- ss/max(ss, na.rm = TRUE)
> ss[is.na(ss)] <- 0
> diag(ss) <- 1
> ss
      1   2   3 4   5   6   7   9  ...
 1: 1.0 0.5 0.0 0 0.0 0.0 0.5 0.0 
 2: 0.5 1.0 0.5 0 0.5 0.5 1.0 0.0 
 3: 0.0 0.5 1.0 1 0.5 0.5 0.5 1.0 
 4: 0.0 0.0 1.0 1 0.0 0.0 0.0 0.0 
 5: 0.0 0.5 0.5 0 1.0 1.0 0.0 0.5 
 6: 0.0 0.5 0.5 0 1.0 1.0 0.5 0.5 
 7: 0.5 1.0 0.5 0 0.0 0.5 1.0 1.0 
 8: 0.0 0.0 1.0 0 0.5 0.5 1.0 1.0 
 9: 0.0 0.0 0.5 0 0.5 0.0 0.0 0.5 
10: 0.0 0.5 0.0 0 0.5 0.5 0.0 0.0  
...
> dd <- as.dist(1 - ss)
> dd
     1   2   3   4   5   6   7   9  ...
2  0.5                              
3  1.0 0.5                          
4  1.0 1.0 0.0                     
5  1.0 0.5 0.5 1.0                                                                                                                        
6  1.0 0.5 0.5 1.0 0.0           
7  0.5 0.0 0.5 1.0 1.0 0.5         
9  1.0 1.0 0.0 1.0 0.5 0.5 0.0  
...
> hc2 <- hclust(dd)
> plot(hc2)

[图片:plot(hc2) 生成的层次聚类树状图]

PAM 示例(pam() 来自 cluster 包,需要先执行 library(cluster)):

> # hclust - 5
> cl <- cutree(hc2, 5)
> summary(as.factor(cl))
  1   2   3   4   5 
562   1   1   2   1 
> 
> # pam - 5 with dd
> pam1 <- pam(dd, 5)
> summary(as.factor(pam1$clustering))
  1   2   3   4   5 
402 105  22  21  17 
> 
> # pam - 5 with sqrt(ss)
> dd2 <- as.dist(1 - sqrt(ss))
> pam2 <- pam(dd2, 5)
> summary(as.factor(pam2$clustering))
  1   2   3   4   5 
362  95  23  61  26