假设我有一个包含样本的数据框:
structure(list(V1 = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 6L,
6L, 6L, 6L, 7L, 7L, 7L, 9L, 9L, 9L, 13L, 13L, 13L, 15L, 15L,
18L, 22L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 27L, 27L, 28L, 32L, 32L, 32L, 32L, 32L, 36L, 36L, 36L, 36L,
36L, 36L, 36L, 37L, 37L, 37L, 37L, 37L, 37L, 38L, 38L, 38L, 38L,
38L, 39L, 39L, 39L, 39L, 40L, 40L, 40L, 41L, 41L, 42L, 45L, 45L,
45L, 45L, 47L, 47L, 47L, 48L, 50L, 50L, 51L, 53L, 53L, 54L),
V2 = c(2L, 7L, 20L, 3L, 5L, 6L, 7L, 13L, 15L, 18L, 19L, 20L,
4L, 5L, 6L, 7L, 9L, 12L, 6L, 9L, 12L, 13L, 15L, 18L, 7L,
9L, 13L, 15L, 18L, 9L, 20L, 44L, 12L, 27L, 44L, 15L, 18L,
58L, 16L, 18L, 19L, 23L, 27L, 28L, 29L, 32L, 45L, 47L, 50L,
51L, 52L, 53L, 54L, 55L, 28L, 29L, 29L, 45L, 47L, 53L, 54L,
55L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 38L, 39L, 40L, 41L,
42L, 43L, 39L, 40L, 41L, 42L, 43L, 40L, 41L, 42L, 43L, 41L,
42L, 43L, 42L, 43L, 43L, 47L, 53L, 54L, 55L, 53L, 54L, 55L,
49L, 51L, 52L, 52L, 54L, 55L, 55L), N = c(1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA,
-104L), class = c("data.table", "data.frame"))
我应该如何组织这些数据，才能使用cor()、dist()和hclust()对这些材料进行聚类?
供参考：V1和V2是材料编号，N是它们以相同顺序一起出现的频率。如果这样更方便，我也可以把N换算成0到1之间的数字来表示相关性。
据我所知,我必须首先将其更改为矩阵,看起来像这样,其中V1可以是行,V2可以是列,N是值。但我不知道该怎么做：
1 2 3 4 ...
1 0 1 1 4
2 1 0 2 2
3 1 4 0 1
4 1 0 3 0
...
答案 0 :(得分:0)
“据我所知,我必须首先将其更改为矩阵,看起来像这样,其中V1可以是行,V2可以是列,N是值。”
进行聚类分析需要一个距离矩阵，而你定义的矩阵并不是距离矩阵。
我认为你希望一起出现的材料彼此更接近，而不一起出现的材料彼此更远。所以我建议：两种材料一起出现时，它们之间的距离取1/N；不一起出现时，距离取2(因为N至少为1，1/N最大为1，所以2比任何一起出现的材料之间的距离都大)。这样你会得到类似下面的结果(dcast()和.()语法需要先加载data.table包)：
> ## df is your data.frame
> dd <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> dd <- 1/dd
> dd[is.na(dd)] <- 2
> diag(dd) <- 0
> dd <- as.dist(dd)
> dd
1 2 3 4 5 6 7 9 ...
2 1.0
3 2.0 1.0
4 2.0 2.0 0.5
5 2.0 1.0 1.0 2.0
6 2.0 1.0 1.0 2.0 0.5
7 1.0 0.5 1.0 2.0 2.0 1.0
9 2.0 2.0 0.5 2.0 1.0 1.0 0.5
...
> hc <- hclust(dd)
> plot(hc)
使用相似性矩阵的示例：
> ss <- dcast(rbind(df, df[, .(V1 = V2, V2 = V1, N)]), V1~V2, value.var = "N")[, -1]
> ss <- ss/max(ss, na.rm = TRUE)
> ss[is.na(ss)] <- 0
> diag(ss) <- 1
> ss
1 2 3 4 5 6 7 9 ...
1: 1.0 0.5 0.0 0 0.0 0.0 0.5 0.0
2: 0.5 1.0 0.5 0 0.5 0.5 1.0 0.0
3: 0.0 0.5 1.0 1 0.5 0.5 0.5 1.0
4: 0.0 0.0 1.0 1 0.0 0.0 0.0 0.0
5: 0.0 0.5 0.5 0 1.0 1.0 0.0 0.5
6: 0.0 0.5 0.5 0 1.0 1.0 0.5 0.5
7: 0.5 1.0 0.5 0 0.0 0.5 1.0 1.0
8: 0.0 0.0 1.0 0 0.5 0.5 1.0 1.0
9: 0.0 0.0 0.5 0 0.5 0.0 0.0 0.5
10: 0.0 0.5 0.0 0 0.5 0.5 0.0 0.0
...
> dd <- as.dist(1 - ss)
> dd
1 2 3 4 5 6 7 9 ...
2 0.5
3 1.0 0.5
4 1.0 1.0 0.0
5 1.0 0.5 0.5 1.0
6 1.0 0.5 0.5 1.0 0.0
7 0.5 0.0 0.5 1.0 1.0 0.5
9 1.0 1.0 0.0 1.0 0.5 0.5 0.0
...
> hc2 <- hclust(dd)
> plot(hc2)
PAM示例(pam()来自cluster包，需要先运行library(cluster))：
> # hclust - 5
> cl <- cutree(hc2, 5)
> summary(as.factor(cl))
1 2 3 4 5
562 1 1 2 1
>
> # pam - 5 with dd
> pam1 <- pam(dd, 5)
> summary(as.factor(pam1$clustering))
1 2 3 4 5
402 105 22 21 17
>
> # pam - 5 with sqrt(ss)
> dd2 <- as.dist(1 - sqrt(ss))
> pam2 <- pam(dd2, 5)
> summary(as.factor(pam2$clustering))
1 2 3 4 5
362 95 23 61 26