I used two different clustering methods to generate two cluster solutions, each containing 10 different groups. However, the two solutions are coded differently. The following example illustrates what the cluster assignments look like:
set.seed(1)
Df <- data.frame(Var1 = sample(1:6, 100, replace = TRUE),
                 Var2 = sample(1:6, 100, replace = TRUE))
table(Df)
I want to find the percentage agreement (or number of agreements) between the two methods and recode Cluster2 into the levels of Cluster1 so that the percentage agreement (or number of agreeing cases) is maximised. I wrote an algorithm to do this, but it does not scale well once the number of clusters grows. My data set has more than 100,000 cases.
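For concreteness, this is the kind of quantity I mean, computed here for one arbitrary (purely hypothetical) recoding of Var2 into the levels of Var1:
# hypothetical recoding: position v holds the Var1 level assigned to Var2 level v
recode_map <- c(3, 1, 5, 2, 6, 4)
# number of agreements and percentage agreement for this particular recoding
sum(recode_map[Df$Var2] == Df$Var1)
mean(recode_map[Df$Var2] == Df$Var1)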
Answer 0 (score: 0)
This may be a shotgun approach, since I don't know how many clusters your real data has. Here I simply try every possible permutation:
df <- data.frame(Cluster1 = c("A", "A", "B", "B", "C", "C", "C"),
                 Cluster2 = c("1", "2", "3", "3", "2", "1", "3"))
require(gtools)
# all permutations of the cluster labels 1:3
comb <- permutations(n = 3, r = 3, v = 1:3)
# try every permutation and count the matches
nmatch <- apply(comb, 1, function(x) sum(LETTERS[match(df$Cluster2, x)] == df$Cluster1))
# pick the best-performing translation
best <- comb[which.max(nmatch), ]
# generate the translation table
data.frame(Cluster2 = 1:3, Cluster2new = LETTERS[best])
Result:
  Cluster2 Cluster2new
1        1           A
2        2           C
3        3           B
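The agreement achieved by the best translation can be read off directly from nmatch (here 4 of the 7 cases, roughly 57%):
max(nmatch)            # number of agreeing cases
max(nmatch) / nrow(df) # percentage agreement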
With your new example data:
set.seed(314)
df <- data.frame(Cluster1 = sample(LETTERS[1:6], 100, replace = TRUE),
                 Cluster2 = sample(1:6, 100, replace = TRUE))
require(gtools)
comb <- permutations(n = 6, r = 6, v = 1:6)
# try every permutation and count the matches
nmatch <- apply(comb, 1, function(x) sum(LETTERS[match(df$Cluster2, x)] == df$Cluster1))
# pick the best-performing translation
best <- comb[which.max(nmatch), ]
# generate the translation table
data.frame(Cluster2 = 1:6, Cluster2new = LETTERS[best])
Result:
  Cluster2 Cluster2new
1        1           B
2        2           D
3        3           C
4        4           A
5        5           E
6        6           F
Computing all the permutations seems to be the limiting factor. So here is an alternative that randomly samples candidate permutations and counts the percentage of matches. This approach is much faster, but it may miss the optimal solution.
set.seed(314)
c <- 10000     # number of cases
n <- 10        # number of clusters
tries <- 1000  # number of random permutations to try
df <- data.frame(Cluster1 = sample(LETTERS[1:n], c, replace = TRUE),
                 Cluster2 = sample(1:n, c, replace = TRUE))
# try random permutations and count the matches
nmatch <- sapply(1:tries, function(x) {
  set.seed(x)
  comb <- sample(1:n, n)
  sum(LETTERS[match(df$Cluster2, comb)] == df$Cluster1)
})
# pick the best-performing translation
best <- which.max(nmatch)
# regenerate that permutation and build the translation table
set.seed(best)
data.frame(Cluster2 = 1:n, Cluster2new = LETTERS[sample(1:n, n)])
nmatch[best]/c
Result:
   Cluster2 Cluster2new
1         1           B
2         2           J
3         3           D
4         4           C
5         5           A
6         6           G
7         7           E
8         8           F
9         9           I
10       10           H

> nmatch[best]/c
[1] 0.1099
Or a slower, iterative procedure:
# greedy variant (note: this masks base::solve): walk through the slots in the
# random order `start` and fill each with the remaining Cluster2 level whose
# cases most often already carry the corresponding letter in Cluster1
solve <- function(start) {
  sol <- integer()
  left <- start
  for (i in start) {
    nmatch <- sapply(left, function(x) {
      cl <- df[df$Cluster2 == x, ]
      sum(LETTERS[cl$Cluster2] == cl$Cluster1)
    })
    ix <- which.max(nmatch)
    sol[i] <- left[ix]
    left <- left[-ix]
  }
  sol
}
# try many random starting orders and count the matches
nmatch <- sapply(1:tries, function(x) {
  set.seed(x)
  sum(LETTERS[match(df$Cluster2, solve(sample(1:n)))] == df$Cluster1)
})
# pick the best run and rebuild its translation table
best <- which.max(nmatch)
set.seed(best)
data.frame(Cluster2 = 1:n, Cluster2new = LETTERS[solve(sample(1:n))])
nmatch[best]/c
Result:
   Cluster2 Cluster2new
1         1           D
2         2           G
3         3           C
4         4           I
5         5           E
6         6           A
7         7           B
8         8           J
9         9           F
10       10           H

> nmatch[best]/c
[1] 0.1121
As an example, when you look at the distribution of nmatch for each approach, the second, iterative procedure seems more likely to end up with a good solution:
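A sketch of that comparison, assuming the two nmatch vectors from the runs above were kept under the (hypothetical) names nmatch_random and nmatch_greedy:
# compare the distributions of the match proportions from both approaches
boxplot(list(random = nmatch_random / c, iterative = nmatch_greedy / c),
        ylab = "proportion of agreeing cases")
summary(nmatch_random / c)
summary(nmatch_greedy / c)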
Answer 1 (score: 0)
After thinking it over, I believe I found a simple answer. I can just use a loop that repeatedly trims the cross-table and picks the best match.
n <- 10      # number of clusters (as in the answer above)
c <- 10000   # number of cases
set.seed(1)
df <- data.frame(Cluster1 = sample(LETTERS[1:n], c, replace = TRUE),
                 Cluster2 = sample(1:n, c, replace = TRUE))

findmatch <- function(df, group1 = "Cluster1", group2 = "Cluster2") {
  n <- length(unique(df[, group1]))
  matches <- matrix(NA, n, 2)
  for (i in 1:n) {
    if (i == 1) {
      # full cross-table of the two cluster solutions
      table1 <- table(df[, group1], df[, group2])
    } else if (i < n) {
      # drop the row and column matched in the previous step
      table1 <- table1[-maxs[1], -maxs[2]]
    }
    # position of the largest remaining cell (first one in case of ties)
    maxs <- which(table1 == max(table1), arr.ind = TRUE)[1, ]
    if (i < n) {
      matches[i, 1:2] <- c(rownames(table1)[maxs[1]], colnames(table1)[maxs[2]])
    } else {
      # last step: take the leftover row/column pair
      matches[i, 1:2] <- c(rownames(table1)[-maxs[1]], colnames(table1)[-maxs[2]])
    }
  }
  return(matches)
}
findmatch(df = df)
[,1] [,2]
[1,] "J" "5"
[2,] "I" "7"
[3,] "A" "6"
[4,] "E" "3"
[5,] "D" "10"
[6,] "C" "8"
[7,] "B" "1"
[8,] "F" "9"
[9,] "H" "2"
[10,] "G" "4"
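If you also want the percentage agreement implied by this matching (the other part of the question), a small follow-up along these lines should do it; recode_map below is just a hypothetical named lookup built from the matrix returned above:
m <- findmatch(df = df)
# lookup: names are the Cluster2 levels, values are the matched Cluster1 labels
recode_map <- setNames(m[, 1], m[, 2])
# share of cases where the recoded Cluster2 agrees with Cluster1
mean(recode_map[as.character(df$Cluster2)] == df$Cluster1)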