我想提取一个序列中依次出现的各个状态:去掉连续重复的值,但仍然保持原有顺序。让我举个例子来解释一下。
我的2个序列看起来像这样:
library(reshape2)
library(dplyr)
# Long format: one row per id and time column, ordered by id.
dt %>%
  melt(id.vars = "id") %>%
  arrange(id)
id variable value
1 1 t1 Domestic
2 1 t2 Domestic
3 1 t3 Domestic
4 1 t4 Domestic
5 1 t5 Domestic
6 1 t6 Eat
7 1 t7 Eat
8 1 t8 Eat
9 1 t9 Eat
10 1 t10 Eat
11 1 t11 Domestic
12 2 t1 SocialContacts
13 2 t2 Travel
14 2 t3 Travel
15 2 t4 Domestic
16 2 t5 Travel
17 2 t6 Travel
18 2 t7 Domestic
19 2 t8 Domestic
20 2 t9 Domestic
21 2 t10 Travel
22 2 t11 Travel
我想要的输出如下:
id value
(int) (chr)
1 Domestic
1 Eat
1 Domestic
2 SocialContacts
2 Travel
2 Domestic
2 Travel
2 Domestic
2 Travel
到目前为止,我只能做到这一步:
# Attempt so far: group by (id, value) and keep distinct rows.
# As the output below shows, this leaves a single row per (id, value) pair, so
# a state that recurs later within the same id (e.g. the second "Domestic" run
# of id 1) is dropped instead of being kept in sequence.
dt %>% melt(id.vars = 'id') %>% group_by(id, value) %>% arrange(id) %>% distinct()
id variable value
(int) (fctr) (chr)
1 1 t1 Domestic
2 1 t6 Eat
3 2 t4 Domestic
4 2 t1 SocialContacts
5 2 t2 Travel
有什么想法吗?
# Wide example data: one row per id, one character column per time point
# (t1..t11), with the id column last.
dt <- data.frame(
  t1 = c("Domestic", "SocialContacts"),
  t2 = c("Domestic", "Travel"),
  t3 = c("Domestic", "Travel"),
  t4 = c("Domestic", "Domestic"),
  t5 = c("Domestic", "Travel"),
  t6 = c("Eat", "Travel"),
  t7 = c("Eat", "Domestic"),
  t8 = c("Eat", "Domestic"),
  t9 = c("Eat", "Domestic"),
  t10 = c("Eat", "Travel"),
  t11 = c("Domestic", "Travel"),
  id = 1:2,
  stringsAsFactors = FALSE
)
答案 0 :(得分:3)
作为@Psidom答案的替代方案,这里改用dplyr实现。
输入数据:
# Long example data: 11 time points (t1..t11) for each of two ids, with
# character row names "1".."22".
input <- structure(
  list(
    id = rep(1:2, each = 11L),
    variable = rep(paste0("t", 1:11), times = 2L),
    value = rep(
      c(
        "Domestic", "Eat", "Domestic", "SocialContacts", "Travel",
        "Domestic", "Travel", "Domestic", "Travel"
      ),
      times = c(5L, 5L, 1L, 1L, 2L, 1L, 2L, 3L, 2L)
    )
  ),
  class = "data.frame",
  row.names = as.character(1:22)
)
代码:
library(dplyr)
# Keep the first row of every run of identical consecutive values.
# Expects `input` in long format, ordered by id and then by time point.
input %>%
  # Start a new run whenever the value changes OR a new id begins; without the
  # id check, the last run of one id and the first run of the next id would be
  # merged whenever they share the same value.
  mutate(
    grp = cumsum(
      id != lag(id, default = id[1]) |
        value != lag(value, default = value[1])
    )
  ) %>%
  group_by(grp) %>%
  # One row per run: its first observation.
  slice(1)
# Source: local data frame [9 x 4]
# Groups: grp [9]
# id variable value grp
# <int> <chr> <chr> <int>
# 1 1 t1 Domestic 0
# 2 1 t6 Eat 1
# 3 1 t11 Domestic 2
# 4 2 t1 SocialContacts 3
# 5 2 t2 Travel 4
# 6 2 t4 Domestic 5
# 7 2 t5 Travel 6
# 8 2 t7 Domestic 7
# 9 2 t10 Travel 8
答案 1 :(得分:2)
使用data.table中的rleid:
library(data.table)
library(dplyr)
# Label each run of identical consecutive values with rleid(), keep one value
# per (id, run), then drop the helper column.
dt %>%
  melt(id.vars = "id") %>%
  arrange(id) %>%
  group_by(id, run_id = rleid(value)) %>%
  summarise(value = unique(value)) %>%
  select(-run_id)
# id value
# 1 1 Domestic
# 6 1 Eat
# 11 1 Domestic
# 12 2 SocialContacts
# 13 2 Travel
# 15 2 Domestic
# 16 2 Travel
# 18 2 Domestic
# 21 2 Travel
使用data.table的类似方法:
library(data.table)
# Melt to long format, order by id, tag every row with (id, run id of value),
# de-duplicate so one row per run remains, then drop the helper column.
# Note: setDT() converts `dt` to a data.table by reference.
unique(
  melt(setDT(dt), id.vars = "id")[order(id)][
    , .(value), by = .(id, rleid(value))
  ]
)[, -"rleid", with = FALSE]
# id value
# 1: 1 Domestic
# 2: 1 Eat
# 3: 1 Domestic
# 4: 2 SocialContacts
# 5: 2 Travel
# 6: 2 Domestic
# 7: 2 Travel
# 8: 2 Domestic
# 9: 2 Travel
答案 2 :(得分:2)
以下是data.table解决方案:
library(data.table)
# Convert dt to a data.table by reference.
setDT(dt)
# Secondary id that increments whenever id or value changes. Including id
# prevents a run from spanning two ids that happen to meet on the same value.
dt[, id2 := rleid(id, value)]
# Subset to the first row of each run, then drop the helper column and print.
dt[dt[, .I[1L], by = "id2"]$V1, ][, id2 := NULL][]
打印出来
id variable value
1: 1 t1 Domestic
2: 1 t6 Eat
3: 1 t11 Domestic
4: 2 t1 SocialContacts
5: 2 t2 Travel
6: 2 t4 Domestic
7: 2 t5 Travel
8: 2 t7 Domestic
9: 2 t10 Travel
数据
# Long-format example data read from inline text. The header row has one field
# fewer than the data rows, so read.table() uses the first column as row names.
# `header = TRUE` is spelled out because `T` is an ordinary, reassignable
# variable.
dt <- read.table(header = TRUE, text=" id variable value
1 1 t1 Domestic
2 1 t2 Domestic
3 1 t3 Domestic
4 1 t4 Domestic
5 1 t5 Domestic
6 1 t6 Eat
7 1 t7 Eat
8 1 t8 Eat
9 1 t9 Eat
10 1 t10 Eat
11 1 t11 Domestic
12 2 t1 SocialContacts
13 2 t2 Travel
14 2 t3 Travel
15 2 t4 Domestic
16 2 t5 Travel
17 2 t6 Travel
18 2 t7 Domestic
19 2 t8 Domestic
20 2 t9 Domestic
21 2 t10 Travel
22 2 t11 Travel")