对 k-means 输出进行排序

时间:2014-01-13 00:27:38

标签: r cluster-analysis k-means

我正在寻找一种对 kmeans() 的输出进行排序的方法。我见过一些类似下面的例子,其作者关心的是在每个簇内部按到簇中心的距离进行排序:

       # Simulate two 2-D Gaussian clouds of 50 points each (means 0 and 1,
       # sd 0.3), cluster them with k-means using two centres, and append the
       # assigned cluster id to the data as a third column named "cl".
       x <- rbind(
         matrix(rnorm(100, sd = 0.3), ncol = 2),
         matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2)
       )
       colnames(x) <- c("x", "y")
       cl <- kmeans(x, centers = 2)
       x <- cbind(x, cl = cl[["cluster"]])
       # Order the rows of one k-means cluster by their distance to that
       # cluster's centre.
       #
       # Arguments:
       #   i       - cluster id; selects the rows of `data` whose third column
       #             equals `i`, and row `i` of `centers`.
       #   data    - numeric matrix with the coordinates in columns 1:2 and the
       #             cluster id in column 3.
       #   centers - matrix of cluster centres, one row per cluster.
       #
       # Returns a matrix holding the rows of cluster `i` plus a "dist" column
       # (squared Euclidean distance to the centre), sorted by ascending dist.
       orderCluster <- function(i, data, centers) {
         # drop = FALSE keeps a matrix even when the cluster has one member
         dt <- data[data[, 3] == i, , drop = FALSE]
         ct <- centers[i, ]
         # Subtract the centre column by column. A plain `dt[, 1:2] - ct`
         # recycles `ct` down the rows (column-major order), which pairs
         # coordinates with the wrong centre component.
         offsets <- sweep(dt[, 1:2, drop = FALSE], 2, ct)
         dt <- cbind(dt, dist = rowSums(offsets^2))
         dt[order(dt[, 4]), , drop = FALSE]
       }
      # Apply orderCluster to every cluster id (in ascending order) and
      # row-bind the per-cluster results into a single matrix.
      do.call(
        rbind,
        lapply(
          sort(unique(cl$cluster)),
          function(k) orderCluster(k, data = x, centers = cl$centers)
        )
      )

除了上述做法之外,我还想根据各个观测点彼此之间的距离,对整体输出进行排序。有什么办法吗?谢谢。

1 个答案:

答案 0 :(得分:0)

以下代码(基本上就是您发布的代码,只是为了便于展示而缩小了数据量)构建了一个矩阵,其中各行在每个簇内按距离排序:

# Reproducible toy data: two 2-D Gaussian clouds of 8 points each (means 0
# and 1, sd 0.3), clustered with k-means using two centres. The assigned
# cluster id is appended to the data as a third column named "cl".
set.seed(144)
x <- rbind(
  matrix(rnorm(16, sd = 0.3), ncol = 2),
  matrix(rnorm(16, mean = 1, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")
cl <- kmeans(x, centers = 2)
x <- cbind(x, cl = cl[["cluster"]])
# Order the rows of one k-means cluster by their distance to that cluster's
# centre.
#
# Arguments:
#   i       - cluster id; selects the rows of `data` whose third column
#             equals `i`, and row `i` of `centers`.
#   data    - numeric matrix with the coordinates in columns 1:2 and the
#             cluster id in column 3.
#   centers - matrix of cluster centres, one row per cluster.
#
# Returns a matrix holding the rows of cluster `i` plus a "dist" column
# (squared Euclidean distance to the centre), sorted by ascending dist.
#
# NOTE: the sample outputs printed further down in this answer were produced
# by the earlier version, which computed `dt[, 1:2] - ct` and therefore
# recycled the centre down the rows. Their dist values (and the resulting
# order) differ from what this corrected function returns.
orderCluster <- function(i, data, centers) {
  # drop = FALSE keeps a matrix even when the cluster has a single member
  dt <- data[data[, 3] == i, , drop = FALSE]
  ct <- centers[i, ]
  # Subtract the centre column by column. A plain `dt[, 1:2] - ct` recycles
  # `ct` in column-major order, which pairs coordinates with the wrong centre
  # component.
  offsets <- sweep(dt[, 1:2, drop = FALSE], 2, ct)
  dt <- cbind(dt, dist = rowSums(offsets^2))
  dt[order(dt[, 4]), , drop = FALSE]
}
# Apply orderCluster to every cluster id (in ascending order), row-bind the
# per-cluster results into one matrix, and print it.
mat <- do.call(
  rbind,
  lapply(
    sort(unique(cl$cluster)),
    function(k) orderCluster(k, data = x, centers = cl$centers)
  )
)
mat
#                 x            y cl        dist
#  [1,] -0.14216517 -0.176358815  1 0.006587057
#  [2,] -0.42633630 -0.133045170  1 0.051628451
#  [3,]  0.04766625 -0.303396617  1 0.087914240
#  [4,]  0.18084317  0.293752160  1 0.131882291
#  [5,] -0.37685063  0.004506463  1 0.132548458
#  [6,] -0.49516685  0.044300123  1 0.145771677
#  [7,]  0.03892986  0.091644147  1 0.157442636
#  [8,] -0.53928391  0.072918569  1 0.284171983
#  [9,]  0.84816448  0.849422864  2 0.012406249
# [10,]  1.11092777  0.940139924  2 0.017545334
# [11,]  1.07090647  1.323721146  2 0.067496535
# [12,]  0.97112156  0.797174195  2 0.079830832
# [13,]  0.61441906  0.972293074  2 0.100053503
# [14,]  1.07548106  1.283839079  2 0.148824424
# [15,]  0.57826111  0.747899273  2 0.335880701
# [16,]  1.15112709  1.597604041  2 0.498957213

您问的是如何按到簇中心的距离对这些数据进行整体排序。下面是按距离升序排序的方法:

# All rows of mat, reordered by the "dist" column from smallest to largest.
mat[order(mat[, "dist"], decreasing = FALSE), ]
#                 x            y cl        dist
#  [1,] -0.14216517 -0.176358815  1 0.006587057
#  [2,]  0.84816448  0.849422864  2 0.012406249
#  [3,]  1.11092777  0.940139924  2 0.017545334
#  [4,] -0.42633630 -0.133045170  1 0.051628451
#  [5,]  1.07090647  1.323721146  2 0.067496535
#  [6,]  0.97112156  0.797174195  2 0.079830832
#  [7,]  0.04766625 -0.303396617  1 0.087914240
#  [8,]  0.61441906  0.972293074  2 0.100053503
#  [9,]  0.18084317  0.293752160  1 0.131882291
# [10,] -0.37685063  0.004506463  1 0.132548458
# [11,] -0.49516685  0.044300123  1 0.145771677
# [12,]  1.07548106  1.283839079  2 0.148824424
# [13,]  0.03892986  0.091644147  1 0.157442636
# [14,] -0.53928391  0.072918569  1 0.284171983
# [15,]  0.57826111  0.747899273  2 0.335880701
# [16,]  1.15112709  1.597604041  2 0.498957213

下面是按距离降序排序的方法:

# All rows of mat, reordered by the "dist" column from largest to smallest.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, which would silently change the sort direction.
mat[order(mat[, "dist"], decreasing = TRUE), ]
#                 x            y cl        dist
#  [1,]  1.15112709  1.597604041  2 0.498957213
#  [2,]  0.57826111  0.747899273  2 0.335880701
#  [3,] -0.53928391  0.072918569  1 0.284171983
#  [4,]  0.03892986  0.091644147  1 0.157442636
#  [5,]  1.07548106  1.283839079  2 0.148824424
#  [6,] -0.49516685  0.044300123  1 0.145771677
#  [7,] -0.37685063  0.004506463  1 0.132548458
#  [8,]  0.18084317  0.293752160  1 0.131882291
#  [9,]  0.61441906  0.972293074  2 0.100053503
# [10,]  0.04766625 -0.303396617  1 0.087914240
# [11,]  0.97112156  0.797174195  2 0.079830832
# [12,]  1.07090647  1.323721146  2 0.067496535
# [13,] -0.42633630 -0.133045170  1 0.051628451
# [14,]  1.11092777  0.940139924  2 0.017545334
# [15,]  0.84816448  0.849422864  2 0.012406249
# [16,] -0.14216517 -0.176358815  1 0.006587057