我有一个这样的数据框:
# Example data: nine timestamped observations and a label column whose
# consecutive runs (A A A, B, C C, A A, B) are the subject of the question.
# Timestamps are parsed in the local time zone via strptime().
df <- data.frame(
  Timestamp = as.POSIXct(strptime(c("2018-01-08 13:26:53",
                                    "2018-01-08 13:33:33",
                                    "2018-01-08 13:45:12",
                                    "2018-01-08 14:26:22",
                                    "2018-01-08 14:28:34",
                                    "2018-01-08 14:31:32",
                                    "2018-01-08 15:11:14",
                                    "2018-01-08 15:13:16",
                                    "2018-01-08 15:25:19"),
                                  format = "%Y-%m-%d %H:%M:%OS")),
  Text = c("A", "A", "A", "B", "C", "C", "A", "A", "B")
)
输出:
Timestamp Text
1 2018-01-08 13:26:53 A
2 2018-01-08 13:33:33 A
3 2018-01-08 13:45:12 A
4 2018-01-08 14:26:22 B
5 2018-01-08 14:28:34 C
6 2018-01-08 14:31:32 C
7 2018-01-08 15:11:14 A
8 2018-01-08 15:13:16 A
9 2018-01-08 15:25:19 B
我只想删除按顺序重复的重复项,并且只保留最新的行。所以我想要一个这样的数据框:
Timestamp Text
1 2018-01-08 13:45:12 A
2 2018-01-08 14:26:22 B
3 2018-01-08 14:31:32 C
4 2018-01-08 15:13:16 A
5 2018-01-08 15:25:19 B
有什么想法吗?预先感谢!
答案 0 :(得分:3)
出于完整性考虑:使用rle
查找重复值:
# Convert to data.table so grouped subsetting with .SD is available.
df <- data.table(df)
# rle() encodes runs of consecutive identical Text values;
# a$lengths holds the length of each run.
a <- rle(df$Text)
# Assign a run index to every row: run i is repeated a$lengths[i] times.
df[, groups := rep(seq(1, length(a$lengths)), a$lengths)]
df
# Keep only the last row (.N) of each run. Note the `groups` column
# remains in the result, as the printed output below shows.
df[, .SD[.N, ], by = groups]
groups Timestamp Text
1: 1 2018-01-08 13:45:12 A
2: 2 2018-01-08 14:26:22 B
3: 3 2018-01-08 14:31:32 C
4: 4 2018-01-08 15:13:16 A
5: 5 2018-01-08 15:25:19 B
编辑：附上基准测试（使用放大到 10000 行的数据集）。
# Enlarge the data for benchmarking: resample each column independently,
# with replacement, to 10,000 rows (run lengths become random, but the
# relative timings are still informative).
df <- data.table(df)
df <- df[, lapply(.SD, sample, size = 10000, replace = TRUE), .SDcols = colnames(df)]
# Benchmark candidate: base-R rle() run detection + data.table grouping.
# NOTE(review): modifies the global `df` by reference via `:=` (adds a
# `groups` column), so repeated calls reuse/overwrite that column.
aa <- function(){
a <- rle(df$Text)
df[, groups := rep(seq(1, length(a$lengths)), a$lengths)]
df[, .SD[.N, ], by = groups]
}
# Benchmark candidate: data.table::rleid() assigns a run id in one step,
# then the last row (.N) of each run is kept.
# NOTE(review): also mutates the global `df` by reference via `:=`.
aa2 <- function(){
df[, group := rleid(Text)]
df[, .SD[.N, ], by = group]
}
# Benchmark candidate: dplyr pipeline with data.table::rleid() for run
# ids, keeping the row with the latest Timestamp in each run and then
# dropping the helper column.
bb <- function(){
df %>%
group_by(group = rleid(Text)) %>%
slice(which.max(Timestamp)) %>%
ungroup() %>%
select(-group)
}
# Benchmark candidate: pure-dplyr run detection via cumsum of changes.
# Group increments whenever the current Text differs from the previous
# one; the last row of each group is kept.
# Fixed: the original built the mask from the *global* object
# (`df$Text[-1] != df$Text[-n()]`) inside mutate(), which silently
# breaks if the piped data has been filtered or reordered first and
# errors on zero-row input. lag() operates on the piped column instead;
# `default = Text[1]` makes the first comparison FALSE, matching the
# original `c(FALSE, ...)` prefix.
cc <- function(){
df %>%
mutate(Group = cumsum(Text != lag(Text, default = Text[1]))) %>%
group_by(Group) %>%
filter(row_number() == n()) %>%
ungroup() %>%
select(-Group)
}
> microbenchmark(aa(), aa2(), bb(), cc(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
aa() 1212.6609 1252.2010 1267.8729 1279.0700 1282.9894 1312.4432 5 c
aa2() 1213.9839 1271.1910 1275.3573 1283.8008 1299.9422 1307.8685 5 c
bb() 112.8352 116.5473 152.9838 142.4634 160.9753 232.0976 5 a
cc() 306.1699 306.4497 316.5756 315.7423 326.8091 327.7069 5 b
答案 1 :(得分:2)
我们可以使用 data.table 中的 rleid 创建分组，并从每个组中仅选择 Timestamp 的最大值（max）所在的行。
library(dplyr)
library(data.table)
# Tag each run of consecutive identical Text values with rleid(), keep
# the single row holding that run's latest Timestamp, then drop the
# helper column so only Timestamp and Text remain.
df %>%
  mutate(run_id = rleid(Text)) %>%
  group_by(run_id) %>%
  slice(which.max(Timestamp)) %>%
  ungroup() %>%
  select(-run_id)
# Timestamp Text
# <dttm> <fct>
#1 2018-01-08 13:45:12 A
#2 2018-01-08 14:26:22 B
#3 2018-01-08 14:31:32 C
#4 2018-01-08 15:13:16 A
#5 2018-01-08 15:25:19 B
答案 2 :(得分:2)
使用 data.table：
（注：此答案的示例代码在页面抓取时丢失；原文残留的片段 "Xnew=[X[:,1,:],X[:,2,:], ...]" 来自其他页面，与本问题无关。）
答案 3 :(得分:1)
此解决方案使用软件包dplyr
。
library(dplyr)

# Number each run of consecutive identical Text values: Group increments
# whenever the current value differs from the previous one, then the
# last row of each run is kept.
# Fixed: the original compared slices of the *global* object inside
# mutate() (`df$Text[-1] != df$Text[-n()]`), which breaks if the piped
# data has been filtered or reordered first and errors on zero-row
# input. lag() works on the piped column; `default = Text[1]` makes the
# first comparison FALSE, matching the original `c(FALSE, ...)` prefix.
df %>%
  mutate(Group = cumsum(Text != lag(Text, default = Text[1]))) %>%
  group_by(Group) %>%
  filter(row_number() == n()) %>%  # keep the last row of each run
  ungroup() %>%
  select(-Group)
## A tibble: 5 x 2
# Timestamp Text
# <dttm> <fct>
#1 2018-01-08 13:45:12 A
#2 2018-01-08 14:26:22 B
#3 2018-01-08 14:31:32 C
#4 2018-01-08 15:13:16 A
#5 2018-01-08 15:25:19 B
答案 4 :(得分:0)
使用hutils::duplicated_rows
函数要容易得多。
library(data.table)
library(hutils)
# Recreate the question's example data: nine timestamped rows whose Text
# column forms the consecutive runs A A A, B, C C, A A, B.
df <- data.frame(
  Timestamp = as.POSIXct(strptime(c("2018-01-08 13:26:53",
                                    "2018-01-08 13:33:33",
                                    "2018-01-08 13:45:12",
                                    "2018-01-08 14:26:22",
                                    "2018-01-08 14:28:34",
                                    "2018-01-08 14:31:32",
                                    "2018-01-08 15:11:14",
                                    "2018-01-08 15:13:16",
                                    "2018-01-08 15:25:19"),
                                  "%Y-%m-%d %H:%M:%OS")),
  Text = c("A", "A", "A", "B", "C", "C", "A", "A", "B")
)
# NOTE(review): hutils::duplicated_rows() returns every row whose Text
# value occurs more than once anywhere in the table, grouped by Text
# (see the printed output below) -- it does NOT collapse *consecutive*
# runs to their latest row, so this does not produce the result the
# question asked for.
duplicated_rows(DT = as.data.table(x = df),
by = "Text")
#> Timestamp Text
#> 1: 2018-01-08 13:26:53 A
#> 2: 2018-01-08 13:33:33 A
#> 3: 2018-01-08 13:45:12 A
#> 4: 2018-01-08 15:11:14 A
#> 5: 2018-01-08 15:13:16 A
#> 6: 2018-01-08 14:26:22 B
#> 7: 2018-01-08 15:25:19 B
#> 8: 2018-01-08 14:28:34 C
#> 9: 2018-01-08 14:31:32 C
由reprex package(v0.2.1)于2019-03-17创建