实现K-Means算法问题

时间:2017-04-28 15:11:36

标签: r

我试图实现K-means但代码有问题,因为我没有得到结果。

这是欧几里德距离计算:

# Pairwise Euclidean distances between the rows of p1 and the rows of p2.
#
# p1, p2: two-dimensional objects, one row per point; they are expected to
#         have the same number of columns.
# Returns a matrix with nrow(p1) rows and nrow(p2) columns; entry [r, i]
# is meant to be the distance from row r of p1 to row i of p2.
#
# NOTE(review): t(p1) - p2[i, ] is a per-coordinate difference only when
# p2[i, ] is a plain numeric vector, i.e. when p1 and p2 are numeric
# matrices. With data.frame arguments p2[i, ] is a one-row data.frame and
# the subtraction no longer pairs coordinates as intended -- confirm what
# callers pass, or convert with as.matrix() first.
euclid <- function(p1, p2) {
   # Result matrix, filled one column (one row of p2) at a time.
   dm <- matrix(NA, nrow=dim(p1)[1], ncol=dim(p2)[1])
   # NOTE(review): 1:nrow(p2) yields c(1, 0) when p2 has no rows.
   for(i in 1:nrow(p2)) {
     dm[,i] <- sqrt(rowSums(t(t(p1)-p2[i,])^2))
   }
   dm
}

这是K-means算法。

# Iterative nearest-centre assignment that records a history per iteration.
#
# dt:  coordinates of the points, one row per point.
# c:   coordinates of the centres, one row per centre.
# itr: number of iterations.
# Returns list(clusters = <list of itr assignment vectors>,
#              c = <list of itr centre objects>).
# Relies on euclid(dt, c) returning a points-by-centres distance matrix.
K_means <- function(dt, c, itr) {

# One history slot per iteration.
clusterHistory <- vector(itr, mode="list")
centerHistory  <- vector(itr, mode="list")

# NOTE(review): 1:itr yields c(1, 0) when itr is 0.
for(i in 1:itr) {
   distsToCenters <- euclid(dt,c)
   # Index of the nearest centre (smallest distance) for every point.
   clusters <- apply(distsToCenters, 1, which.min)
   # Per-cluster mean of every column of dt.
   # NOTE(review): `centers` is never read afterwards and `c` is never
   # reassigned, so every iteration measures distances to the centres that
   # were passed in.
   centers <- apply(dt, 2, tapply, clusters, mean)
   # Save this iteration's assignment; the stored centres are the unchanged
   # input `c`, not the freshly computed `centers`.
   clusterHistory[[i]] <- clusters
   centerHistory[[i]] <- c
}

list(clusters=clusterHistory, c = centerHistory)
}

这些是我的中心:

# Initial cluster centres: one labelled row per centre with X/Y coordinates.
c <- data.frame(
  Label = c("A", "C", "F", "M"),
  X = c(3, 2.1, 3, 5),
  Y = c(6.1, 5, 5, 5)
)

这些是我的全部数据点:

# All 20 observations: labels "A" to "T" with their X/Y coordinates.
dt2 <- data.frame(
  Label = LETTERS[1:20],
  X = c(3.0, 4.0, 2.1, 4.0, 7.0, 3.0, 6.1, 7.0, 4.0, 3.0,
        6.2, 7.0, 5.0, 3.5, 2.5, 3.5, 5.5, 6.0, 0.5, 0.8),
  Y = c(6.1, 2.0, 5.0, 6.0, 3.0, 5.0, 4.0, 2.0, 1.5, 2.0,
        2.0, 3.0, 5.0, 4.5, 6.0, 5.5, 4.5, 1.0, 1.5, 1.2)
)

显示结果:

# Run 4 iterations, passing only columns 2:3 (the X/Y coordinates) of the
# points and of the centres; the Label column is left out of both.
res <- K_means(dt2[,2:3], c[,2:3], 4)
# Print the returned history.
res

我想在适当的群集中放置适当的标签,但它不起作用。

1 个答案:

答案 0 :(得分:2)

顺便说一下,我假设这是一个自学性质的问题,因为R中已经有现成的kmeans函数。如果是这样,DataCamp上有一门关于kmeans的不错课程(第1章免费)。

问题似乎出在你的euclid函数上,特别是这一行:

dm[,i] <- sqrt(rowSums(t(t(p1)-p2[i,])^2))

这一行为dm给出了以下结果:
      [,1] [,2] [,3] [,4]
 [1,]    0  0.9  0.0  2.0
 [2,]    0  1.1  1.1  1.1
 [3,]    0  0.9  0.0  2.0
 [4,]    0  1.1  1.1  1.1
 [5,]    0  0.9  0.0  2.0
 [6,]    0  1.1  1.1  1.1
 [7,]    0  0.9  0.0  2.0
 [8,]    0  1.1  1.1  1.1
 [9,]    0  0.9  0.0  2.0
[10,]    0  1.1  1.1  1.1
[11,]    0  0.9  0.0  2.0
[12,]    0  1.1  1.1  1.1
[13,]    0  0.9  0.0  2.0
[14,]    0  1.1  1.1  1.1
[15,]    0  0.9  0.0  2.0
[16,]    0  1.1  1.1  1.1
[17,]    0  0.9  0.0  2.0
[18,]    0  1.1  1.1  1.1
[19,]    0  0.9  0.0  2.0
[20,]    0  1.1  1.1  1.1

我替换了这一行,修改后的函数如下:

# Pairwise Euclidean distances between the rows of p1 and the rows of p2.
#
# p1: points, one row per point (base data.frame or matrix with numeric
#     columns).
# p2: centres, one row per centre, with the same columns as p1.
# Returns a numeric matrix with nrow(p1) rows and nrow(p2) columns whose
# entry [r, i] is the distance from row r of p1 to row i of p2.
euclid <- function(p1, p2) {
  n_points <- nrow(p1)
  n_dims <- ncol(p1)
  dm <- matrix(NA_real_, nrow = n_points, ncol = nrow(p2))
  # seq_len() skips the loop when there are no centres, where 1:nrow(p2)
  # would count down through c(1, 0).
  for (i in seq_len(nrow(p2))) {
    # Squared differences, one coordinate column at a time. Indexing with
    # [, j] and [i, j] yields plain vectors/scalars for base data.frames and
    # matrices alike. vapply() is the type-stable sibling of sapply().
    sq_diffs <- vapply(
      seq_len(n_dims),
      function(j) (p1[, j] - p2[i, j])^2,
      numeric(n_points)
    )
    # With a single point vapply() returns a bare vector, so restore the
    # points-by-coordinates shape before summing across each row.
    dm[, i] <- sqrt(rowSums(matrix(sq_diffs, nrow = n_points)))
  }
  dm
}

这使用sapply迭代每列。

这给出了

          [,1]     [,2]      [,3]      [,4]
 [1,] 0.000000 1.421267 1.1000000 2.2825424
 [2,] 4.220190 3.551056 3.1622777 3.1622777
 [3,] 1.421267 0.000000 0.9000000 2.9000000
 [4,] 1.004988 2.147091 1.4142136 1.4142136
 [5,] 5.060632 5.292447 4.4721360 2.8284271
 [6,] 1.100000 0.900000 0.0000000 2.0000000
 [7,] 3.744329 4.123106 3.2572995 1.4866069
 [8,] 5.728001 5.745433 5.0000000 3.6055513
 [9,] 4.707441 3.982462 3.6400549 3.6400549
[10,] 4.100000 3.132092 3.0000000 3.6055513
[11,] 5.200961 5.080354 4.3863424 3.2310989
[12,] 5.060632 5.292447 4.4721360 2.8284271
[13,] 2.282542 2.900000 2.0000000 0.0000000
[14,] 1.676305 1.486607 0.7071068 1.5811388
[15,] 0.509902 1.077033 1.1180340 2.6925824
[16,] 0.781025 1.486607 0.7071068 1.5811388
[17,] 2.968164 3.436568 2.5495098 0.7071068
[18,] 5.916925 5.586591 5.0000000 4.1231056
[19,] 5.235456 3.848376 4.3011626 5.7008771
[20,] 5.371220 4.016217 4.3908997 5.6639209

已编辑 - 标签

要获取群集标签,建议您将包含标签的整个data.frame传递到K_means函数,如下所示。

请注意,我还更改了代码,以便在后续迭代中使用新的中心。在您之前的代码中,您始终使用最初传递的中心。

# K-means on labelled points, starting from labelled centres.
#
# dt:  data.frame of points; column 1 holds the point labels and the
#      remaining columns the numeric coordinates.
# c:   data.frame of initial centres in the same layout as dt.
# itr: number of iterations to run (a single non-negative number).
#
# Returns a list with two elements, each a list of length itr:
#   clusters[[i]]: data.frame(Label, cluster_label, cluster_number) holding
#                  the assignment made in iteration i;
#   c[[i]]:        data.frame(Label, <coordinates>) holding the centres
#                  after the update step of iteration i.
# Relies on euclid(points, centres) returning the points-by-centres
# distance matrix.
K_means <- function(dt, c, itr) {
  stopifnot(is.numeric(itr), length(itr) == 1, itr >= 0)

  # Split labels (column 1) from coordinates. drop = FALSE keeps the
  # two-dimensional shape even when only one coordinate column remains.
  dt_labels <- dt[, 1]
  dt_data <- dt[, -1, drop = FALSE]
  c_labels <- c[, 1]
  c_data <- as.matrix(c[, -1, drop = FALSE])
  stopifnot(ncol(c_data) == ncol(dt_data))
  # Centres are reported under the coordinate names of the points.
  colnames(c_data) <- colnames(dt_data)
  n_centers <- nrow(c_data)

  clusterHistory <- vector("list", itr)
  centerHistory <- vector("list", itr)

  # seq_len() runs zero iterations for itr = 0, where 1:itr would run two.
  for (i in seq_len(itr)) {
    # Assignment step: nearest current centre for every point.
    distsToCenters <- euclid(dt_data, c_data)
    clusters <- apply(distsToCenters, 1, which.min)

    # Update step: move each centre to the mean of its members. A centre
    # that attracted no points keeps its previous position, so c_data always
    # has one row per label and cluster numbers keep matching c_labels.
    occupied <- tabulate(clusters, nbins = n_centers) > 0
    groups <- factor(clusters, levels = seq_len(n_centers))
    for (j in seq_len(ncol(dt_data))) {
      col_means <- tapply(dt_data[, j], groups, mean)
      c_data[occupied, j] <- col_means[occupied]
    }

    # Saving history
    clusterHistory[[i]] <- data.frame(
      Label = dt_labels,
      cluster_label = c_labels[clusters],
      cluster_number = clusters
    )
    centerHistory[[i]] <- data.frame(Label = c_labels, c_data)
  }

  list(clusters = clusterHistory, c = centerHistory)
}

更改调用以传递整个data.frame。我假设标签在第1列。

# Example run: labelled initial centres and the 20 labelled points.
c <- data.frame(
  Label = c("A", "C", "F", "M"),
  X = c(3, 2.1, 3, 5),
  Y = c(6.1, 5, 5, 5)
)
dt2 <- data.frame(
  Label = LETTERS[1:20],
  X = c(3.0, 4.0, 2.1, 4.0, 7.0, 3.0, 6.1, 7.0, 4.0, 3.0,
        6.2, 7.0, 5.0, 3.5, 2.5, 3.5, 5.5, 6.0, 0.5, 0.8),
  Y = c(6.1, 2.0, 5.0, 6.0, 3.0, 5.0, 4.0, 2.0, 1.5, 2.0,
        2.0, 3.0, 5.0, 4.5, 6.0, 5.5, 4.5, 1.0, 1.5, 1.2)
)

# The whole data.frames go in (labels in column 1); run 4 iterations and
# print the returned history.
res <- K_means(dt2, c, 4)
res

这给出了以下结果

$clusters
$clusters[[1]]
   Label cluster_label cluster_number
1      A             A              1
2      B             F              3
3      C             C              2
4      D             A              1
5      E             M              4
6      F             F              3
7      G             M              4
8      H             M              4
9      I             F              3
10     J             F              3
11     K             M              4
12     L             M              4
13     M             M              4
14     N             F              3
15     O             A              1
16     P             F              3
17     Q             M              4
18     R             M              4
19     S             C              2
20     T             C              2

$clusters[[2]]
   Label cluster_label cluster_number
1      A             A              1
2      B             F              3
3      C             A              1
4      D             A              1
5      E             M              4
6      F             A              1
7      G             M              4
8      H             M              4
9      I             F              3
10     J             F              3
11     K             M              4
12     L             M              4
13     M             A              1
14     N             F              3
15     O             A              1
16     P             A              1
17     Q             M              4
18     R             M              4
19     S             C              2
20     T             C              2

$clusters[[3]]
   Label cluster_label cluster_number
1      A             A              1
2      B             F              3
3      C             A              1
4      D             A              1
5      E             M              4
6      F             A              1
7      G             M              4
8      H             M              4
9      I             F              3
10     J             F              3
11     K             M              4
12     L             M              4
13     M             A              1
14     N             A              1
15     O             A              1
16     P             A              1
17     Q             M              4
18     R             M              4
19     S             C              2
20     T             C              2

$clusters[[4]]
   Label cluster_label cluster_number
1      A             A              1
2      B             F              3
3      C             A              1
4      D             A              1
5      E             M              4
6      F             A              1
7      G             M              4
8      H             M              4
9      I             F              3
10     J             F              3
11     K             M              4
12     L             M              4
13     M             A              1
14     N             A              1
15     O             A              1
16     P             A              1
17     Q             M              4
18     R             M              4
19     S             C              2
20     T             C              2


$c
$c[[1]]
  Label        X        Y
1     A 3.166667 6.033333
2     C 1.133333 2.566667
3     F 3.500000 3.416667
4     M 6.225000 3.062500

$c[[2]]
  Label     X        Y
1     A 3.300 5.514286
2     C 0.650 1.350000
3     F 3.625 2.500000
4     M 6.400 2.785714

$c[[3]]
  Label        X        Y
1     A 3.325000 5.387500
2     C 0.650000 1.350000
3     F 3.666667 1.833333
4     M 6.400000 2.785714

$c[[4]]
  Label        X        Y
1     A 3.325000 5.387500
2     C 0.650000 1.350000
3     F 3.666667 1.833333
4     M 6.400000 2.785714