使用动态时间规整(DTW)计算距离并进行聚类

时间:2015-12-03 23:14:43

标签: r time-series dplyr hierarchical-clustering

我正在使用 DTW(动态时间规整)计算若干时间序列之间的距离,但得到了奇怪的结果。请注意,在下面的示例数据中,前 9 个客户分为三组,每组内的序列完全相同(A == B == C,D == E == F,G == H == I)。其余各行只是噪声,用来让我能够划分出 8 个簇。

我预期前 9 个客户会各自与其完全相同的序列聚到同一个簇中。对原始数据计算距离时确实如此,但如果在计算距离和聚类之前先对数据进行缩放,结果就不一样了。

原始数据中相同行之间的距离为 0.0(符合预期),但对于缩放后的数据,这些行之间的距离不是 0.0(甚至相差很远)。有人知道为什么会不一样吗?

library(TSdist)
library(dplyr)
library(tidyr)



   # Example data: 36 customers (cust), each with 10 period values (P1..P10).
   # Rows A/B/C, D/E/F and G/H/I are three groups of identical series; the
   # remaining rows are additional series used as noise.
   # The header row has one field fewer than the data rows, so read.table()
   # takes the first field of every data row (1..36) as row names.
   mydata = as_data_frame(read.table(textConnection("
cust   P1   P2   P3   P4  P5   P6   P7   P8   P9  P10
1     A  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
2     B  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
3     C  1.1  1.0  1.0  1.0 1.0  1.0  1.0  1.0  1.0  1.0
4     D  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
5     E  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
6     F  0.0  1.0  2.0  1.0 0.0  1.0  2.0  1.0  0.0  1.0
7     G  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
8     H  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
9     I  2.0  1.5  1.0  0.5 0.0  0.5  1.0  1.5  2.0  1.5
10   D2  1.0  2.0  1.0  0.0 1.0  2.0  1.0  0.0  1.0  2.0
11   E2  5.0  6.0  5.0  4.0 5.0  6.0  5.0  4.0  5.0  6.0
12   F2  9.0 10.0  9.0  8.0 9.0 10.0  9.0  8.0  9.0 10.0
13   G2  1.5  1.0  0.5  0.0 0.5  1.0  1.5  2.0  1.5  1.0
14   H2  5.5  5.0  4.5  4.0 4.5  5.0  5.5  6.0  5.5  5.0
15   I2  9.5  9.0  8.5  8.0 8.5  9.0  9.5 10.0  9.5  9.0
16   A3  1.0  1.0  0.0  2.0 1.0  1.0  1.0  1.0  1.0  1.0
17   B3  5.0  5.0  5.0  5.0 5.0  3.0  8.0  5.0  5.0  5.0
18   C3  9.0  9.0  9.0  9.0 9.0  5.4 14.4  9.0  9.0  9.0
19   D3  0.0  1.0  2.0  1.0 0.0  1.0  1.0  2.0  0.0  1.0
20   E3  4.0  5.0  5.0  6.0 4.0  5.0  6.0  5.0  4.0  5.0
21   F3  8.0  9.0 10.0  9.0 9.0  9.0  9.0  9.0  8.0  9.0
22   G3  2.0  1.5  1.0  0.5 0.0  0.5  1.0  2.0  1.5  1.5
23   H3  6.0  5.5  5.0  4.5 4.0  5.0  4.5  5.5  6.0  5.5
24   I3 10.0  9.5  9.0  9.0 8.0  8.5  9.0  9.5 10.0  9.5
25   D4  0.0  3.0  6.0  3.0 0.0  3.0  6.0  3.0  0.0  5.0
26   E4  3.0  6.0  9.0  6.0 3.0  6.0  9.0  6.0  3.0  6.0
27   F4  4.0  6.0 10.0  7.0 5.0  6.0 11.0  8.0  5.0  7.0
28   D5  5.0  0.0  3.0  6.0 3.0  0.0  3.0  6.0  3.0  0.0
29   D6  9.0  6.0  3.0  6.0 9.0  6.0  3.0  6.0  9.0  6.0
30   D7  9.0 11.0  5.0  4.0 6.0 10.0  7.0  5.0  6.0 11.0
31   Dw  0.0  0.8  1.4  2.0 1.0  0.0  2.0  0.0  1.0  2.0
32   Ew  4.0  4.8  5.4  6.0 5.0  4.0  6.0  4.0  5.0  6.0
33   Fw  8.0  8.8  9.4 10.0 9.0  8.0 10.0  8.0  9.0 10.0
34   Gw  2.0  1.5  1.0  0.5 0.0  1.0  2.0  1.5  1.3  1.1
35   Hw  6.0  5.5  5.0  4.5 4.0  5.0  6.0  5.5  5.3  5.1
36   Iw 10.0  9.5  9.0  8.5 8.0  9.0 10.0  9.5  9.3  9.1"),
                           header = TRUE, stringsAsFactors = FALSE))

# Number of clusters requested from cutree() in both clusterings below.
k=8
# Create a scaled version of mydata: (raw value - mean) / std dev.
# The mean and sd come from apply(..., 1, ...), i.e. they are computed across
# the P1..P10 columns of each row, so every customer is standardised against
# its own series. The data is then gathered into long format (one row per
# cust/period pair) to compute the scaled value `sc`.
      mydata_long = mydata %>%
            mutate (mean = apply(mydata[,2:ncol(mydata)],1,mean,na.rm = T)) %>%
            mutate (sd = apply(mydata[,2:(ncol(mydata))],1,sd,na.rm = T))%>%
            gather (period,value,-cust,-mean,-sd) %>%
            mutate (sc = (value-mean)/sd)
# Drop columns 2, 3 and 5 by position (presumably mean, sd and value, since
# cust, period and sc are needed afterwards), then spread back to one row per
# customer with one column per period.
# NOTE(review): the answer further down reports that the rows of mydata_sc
# come back sorted by cust (A, A3, B, ...) rather than in mydata's order.
      mydata_sc = mydata_long[,-c(2,3,5)] %>%
        spread(period,sc)
  # dtw on the raw data: column 1 (cust) is excluded from the distance input
        dtw_dist = TSDatabaseDistances(mydata[2:ncol(mydata)], distance = "dtw",lag.max= 2) #distance
        dtw_clus = hclust(dtw_dist, method="ward.D2") # Cluster
        dtw_res = data.frame(cutree(dtw_clus, k)) # cut dendrogram into k (= 8) clusters
  # dtw (w scaled data): same steps applied to mydata_sc
        dtw_sc_dist = TSDatabaseDistances(mydata_sc[2:ncol(mydata_sc)], distance = "dtw",lag.max= 2) #distance
        dtw_sc_clus = hclust(dtw_sc_dist, method="ward.D2") # Cluster
        dtw_sc_res = data.frame(cutree(dtw_sc_clus, k)) # cut dendrogram into k (= 8) clusters

# Put the two label vectors side by side. cbind() pairs them by row position,
# so the comparison is only meaningful if mydata and mydata_sc list the
# customers in the same order.
results = cbind (dtw_res,dtw_sc_res)
  names(results) = c("dtw", "dtw_scaled")

  print(results)

   dtw dtw_scaled
1    1          1
2    1          2
3    1          1
4    1          2
5    1          1
6    1          2
7    1          3
8    1          4
9    1          3
10   1          3
11   2          3
12   3          4
13   1          5
14   2          6
15   3          3
16   1          4
17   2          3
18   4          3
19   1          6
20   2          3
21   3          4
22   1          3
23   2          3
24   3          6
25   5          7
26   6          8
27   7          7
28   5          7
29   6          7
30   8          8
31   1          7
32   2          7
33   3          7
34   1          8
35   2          7
36   3          7

1 个答案:

答案 0 :(得分:1)

几个问题

  1. 您正在按行进行缩放,而不是按列进行缩放(请查看dplyr链的中间结果 - 它们是否有意义?)

  2. 您用于生成缩放数据的数据操作将数据框的行顺序更改为按字母顺序排列:

    > mydata_sc %>% head
    
         cust          P1          P2          P3          P4          P5         P6         P7          P8          P9         P10
      (chr)       (dbl)       (dbl)       (dbl)       (dbl)       (dbl)      (dbl)      (dbl)       (dbl)       (dbl)       (dbl)
      1     A  2.84604989 -0.31622777 -0.31622777 -0.31622777 -0.31622777 -0.3162278 -0.3162278 -0.31622777 -0.31622777 -0.31622777
      2    A3  0.00000000  0.00000000 -2.12132034  2.12132034  0.00000000  0.0000000  0.0000000  0.00000000  0.00000000  0.00000000
      3     B  2.84604989 -0.31622777 -0.31622777 -0.31622777 -0.31622777 -0.3162278 -0.3162278 -0.31622777 -0.31622777 -0.31622777
    

    VS。

      > mydata %>% head
      Source: local data frame [6 x 11]
    
             cust    P1    P2    P3    P4    P5    P6    P7    P8    P9   P10
      (chr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
      1     A   1.1     1     1     1     1     1     1     1     1     1
      2     B   1.1     1     1     1     1     1     1     1     1     1
    
  3. (检查 cust 变量排序!)

    下面是我的做法,以及我认为可以帮助您将来避免类似错误的方法:

    1. 使用内置的 scale() 函数进行缩放

      # Standardise the period columns with base R's scale(), which centres and
      # scales each column, then re-attach cust as the first column. Rows are
      # never reordered, so mydata_sc stays aligned with mydata.
      mydata_sc <- mydata %>%
        select(-cust) %>%
        scale() %>%
        as.data.frame() %>%
        cbind(cust = mydata$cust, .) %>%
        as.tbl()
      
    2. 断言您的缩放数据框等同于原始数据框的缩放版本:

      # Sanity check: the summed difference between the two scaled matrices
      # should be (numerically) zero.
      # The pipe must end a line: a line that starts with %>% is parsed as a
      # new, invalid expression when pasted into R.
      # NOTE: positive and negative differences can cancel in a plain sum;
      # max(abs(...)) over the difference would be a stricter check.
      (scale(mydata_sc %>% select(-cust)) - scale(mydata %>% select(-cust))) %>%
        colSums %>%
        sum
      
      [1] 0.000000000000005353357 
      
    3. 创建一个函数来执行所需的操作:

      # Cluster the series in `df` using DTW distances and Ward linkage.
      #
      # df: data frame; column 1 is skipped and columns 2..ncol(df) are passed
      #     to TSDatabaseDistances().
      # Returns a one-column data.frame holding the cluster labels produced by
      # cutree().
      # Note: `k` (the number of clusters) is not a parameter; it is looked up
      # in the enclosing environment when the function runs.
      return_dtw <- function(df) {
        series <- df[2:ncol(df)]
        dtw_dist <- TSDatabaseDistances(series, distance = "dtw", lag.max = 2)
        # Keep the names `res_2` and `k` in the cutree() call: data.frame()
        # derives the column name ("cutree.res_2..k.") from that expression.
        res_2 <- hclust(dtw_dist, method = "ward.D2")
        data.frame(cutree(res_2, k))
      }
      
    4. 执行该函数:

        > mydata %>% return_dtw %>% cbind(mydata_sc %>% return_dtw)
        cutree.res_2..k. cutree.res_2..k.
        1                 1                1
        2                 1                1
        3                 1                1
        4                 1                1
        5                 1                1
        6                 1                1
        7                 1                1
        8                 1                1
        9                 1                1
        10                1                1
        11                2                2
        12                3                3
        13                1                1
        14                2                2
        15                3                3
        16                1                1
        17                2                2
        18                4                3
        19                1                1
        20                2                2
        21                3                3
        22                1                1
        23                2                2
        24                3                3
        25                5                4
        26                6                5
        27                7                5
        28                5                6
        29                6                7
        30                8                8
        31                1                1
        32                2                2
        33                3                3
        34                1                1
        35                2                2
        36                3                3
      
    5. 后面的一些客户在两种结果中的分组并不一致,但那是另一个问题!