r - 检索不同的序列模式

时间:2016-07-01 17:25:45

标签: r distinct sequence

我想检索一个序列的不同状态,摆脱重复但仍然保持顺序。让我举个例子来解释一下。

我的2个序列看起来像这样:

library(reshape2) 
library(dplyr)    

# Reshape the wide table to long format (one row per id / time point),
# then sort the rows by id.
dt %>%
  melt(id.vars = "id") %>%
  arrange(id)

   id variable          value
1   1       t1       Domestic
2   1       t2       Domestic
3   1       t3       Domestic
4   1       t4       Domestic
5   1       t5       Domestic
6   1       t6            Eat
7   1       t7            Eat
8   1       t8            Eat
9   1       t9            Eat
10  1      t10            Eat
11  1      t11       Domestic

12  2       t1 SocialContacts
13  2       t2         Travel
14  2       t3         Travel
15  2       t4       Domestic
16  2       t5         Travel
17  2       t6         Travel
18  2       t7       Domestic
19  2       t8       Domestic
20  2       t9       Domestic
21  2      t10         Travel
22  2      t11         Travel

我想要的是这个(期望的输出):

     id             value
    (int)           (chr)
      1             Domestic
      1             Eat
      1             Domestic

      2             SocialContacts
      2             Travel
      2             Domestic
      2             Travel
      2             Domestic
      2             Travel

到目前为止,我只实现了这个目标:

# Attempt so far: group by (id, value) and keep distinct rows. As the output
# below shows, this keeps a single row per (id, value) pair, so later runs of
# a state that already occurred for the same id are lost.
dt %>%
  melt(id.vars = "id") %>%
  group_by(id, value) %>%
  arrange(id) %>%
  distinct()

     id variable          value
   (int)   (fctr)          (chr)
1     1       t1       Domestic
2     1       t6            Eat
3     2       t4       Domestic
4     2       t1 SocialContacts
5     2       t2         Travel

有什么想法吗?

# Example data in wide format: one row per sequence (id), one column per
# time point t1..t11 holding the state at that time.
dt <- structure(
  list(
    t1 = c("Domestic", "SocialContacts"),
    t2 = c("Domestic", "Travel"),
    t3 = c("Domestic", "Travel"),
    t4 = c("Domestic", "Domestic"),
    t5 = c("Domestic", "Travel"),
    t6 = c("Eat", "Travel"),
    t7 = c("Eat", "Domestic"),
    t8 = c("Eat", "Domestic"),
    t9 = c("Eat", "Domestic"),
    t10 = c("Eat", "Travel"),
    t11 = c("Domestic", "Travel"),
    id = 1:2
  ),
  row.names = 1:2,
  class = "data.frame"
)

3 个答案:

答案 0 :(得分:3)

作为 @Psidom 答案的替代方案,这里给出一个 dplyr 版本:

# Example data in long format: 2 sequences (id) x 11 time points (variable),
# with the observed state in `value`. Row names are the strings "1".."22".
input <- structure(
  list(
    id = rep(1:2, each = 11L),
    variable = rep(paste0("t", 1:11), times = 2L),
    value = c(
      # id 1
      rep("Domestic", 5L), rep("Eat", 5L), "Domestic",
      # id 2
      "SocialContacts", "Travel", "Travel", "Domestic", "Travel", "Travel",
      "Domestic", "Domestic", "Domestic", "Travel", "Travel"
    )
  ),
  class = "data.frame",
  row.names = as.character(1:22)
)

代码:

library(dplyr)
# Number the runs of consecutive identical states and keep the first row of
# each run. A new run starts whenever `value` OR `id` differs from the
# previous row, so a sequence that ends in the same state the next sequence
# starts with is not merged into a single run.
# Assumes the rows are already ordered by id and time, as in `input`.
input %>%
  mutate(
    grp = cumsum(
      value != lag(value, default = value[1]) |
        id != lag(id, default = id[1])
    )
  ) %>%
  group_by(grp) %>%
  slice(1)
# Source: local data frame [9 x 4]
# Groups: grp [9]
#      id variable          value   grp
#   <int>    <chr>          <chr> <int>
# 1     1       t1       Domestic     0
# 2     1       t6            Eat     1
# 3     1      t11       Domestic     2
# 4     2       t1 SocialContacts     3
# 5     2       t2         Travel     4
# 6     2       t4       Domestic     5
# 7     2       t5         Travel     6
# 8     2       t7       Domestic     7
# 9     2      t10         Travel     8

答案 1 :(得分:2)

使用 data.table 中的 rleid:

library(data.table)
library(dplyr)
# Melt to long format, sort by id, then group by id and by run of identical
# consecutive values (data.table::rleid). Every row within a run carries the
# same value, so the first one represents the run. Finally drop the helper
# run column.
dt %>%
  melt(id.vars = "id") %>%
  arrange(id) %>%
  group_by(id, rleid = rleid(value)) %>%
  summarise(value = value[1L]) %>%
  select(-rleid)

#    id          value
# 1   1       Domestic
# 6   1            Eat
# 11  1       Domestic
# 12  2 SocialContacts
# 13  2         Travel
# 15  2       Domestic
# 16  2         Travel
# 18  2       Domestic
# 21  2         Travel

使用data.table的类似方法:

library(data.table)
# Melt to long format (setDT converts `dt` to a data.table by reference),
# order by id, reduce every (id, run of identical consecutive values) group
# to its value, and drop the helper run column.
# `with = FALSE` is spelled out because `F` is an ordinary, reassignable
# variable.
unique(
  melt(setDT(dt), id.vars = "id")[order(id)][
    , .(value), by = .(id, rleid = rleid(value))
  ]
)[, -"rleid", with = FALSE]

#    id          value
# 1:  1       Domestic
# 2:  1            Eat
# 3:  1       Domestic
# 4:  2 SocialContacts
# 5:  2         Travel
# 6:  2       Domestic
# 7:  2         Travel
# 8:  2       Domestic
# 9:  2         Travel

答案 2 :(得分:2)

以下是data.table解决方案:

library(data.table)    
# Convert `dt` (long format: id, variable, value) to a data.table by reference.
setDT(dt)
# Secondary id: run number of consecutive identical states. `id` is part of
# the run key so that a run never spans two different sequences (e.g. when
# one id ends in the same state the next id starts with).
dt[, id2 := rleid(id, value)]
# Subset to the first row of every run, then drop the helper column from the
# result and print it.
dt[dt[, .I[1L], by = "id2"]$V1, ][, id2 := NULL][]

打印出来

   id variable          value
1:  1       t1       Domestic
2:  1       t6            Eat
3:  1      t11       Domestic
4:  2       t1 SocialContacts
5:  2       t2         Travel
6:  2       t4       Domestic
7:  2       t5         Travel
8:  2       t7       Domestic
9:  2      t10         Travel

数据

# Example data in long format, parsed from the printed table. The first
# column of the text holds the row names; `TRUE` is spelled out because `T`
# is an ordinary, reassignable variable.
dt <- read.table(header = TRUE, text = "   id variable          value
1   1       t1       Domestic
2   1       t2       Domestic
3   1       t3       Domestic
4   1       t4       Domestic
5   1       t5       Domestic
6   1       t6            Eat
7   1       t7            Eat
8   1       t8            Eat
9   1       t9            Eat
10  1      t10            Eat
11  1      t11       Domestic
12  2       t1 SocialContacts
13  2       t2         Travel
14  2       t3         Travel
15  2       t4       Domestic
16  2       t5         Travel
17  2       t6         Travel
18  2       t7       Domestic
19  2       t8       Domestic
20  2       t9       Domestic
21  2      t10         Travel
22  2      t11         Travel")