如何创建一个新列,按Letter + Year的唯一组合,累积地标记Month列中新值的首次出现?
数据样本。
# Sample data: 13 rows of (Letter, Year, Month) observations, listed in
# chronological order, all within 2018.
# library() is used instead of require() so a missing package fails loudly
# here rather than later at the first data.table call.
library(data.table)
dt <- data.table(
  Letter = LETTERS[c(5, 1:2, 1:2, 1:4, 3:6)],
  Year = 2018,
  Month = c(rep(5, 5), rep(6, 4), rep(7, 4))
)
打印。
Letter Year Month
1: E 2018 5
2: A 2018 5
3: B 2018 5
4: A 2018 5
5: B 2018 5
6: A 2018 6
7: B 2018 6
8: C 2018 6
9: D 2018 6
10: C 2018 7
11: D 2018 7
12: E 2018 7
13: F 2018 7
我想要得到的结果:
Letter Year Month New
1: E 2018 5 TRUE
2: A 2018 5 TRUE
3: B 2018 5 TRUE
4: A 2018 5 TRUE
5: B 2018 5 TRUE
6: A 2018 6 FALSE
7: B 2018 6 FALSE
8: C 2018 6 TRUE
9: D 2018 6 TRUE
10: C 2018 7 FALSE
11: D 2018 7 FALSE
12: E 2018 7 FALSE
13: F 2018 7 TRUE
详细问题:
答案 0 :(得分:5)
先将新列初始化为FALSE;然后与每个Letter首次出现的Year-Month做联接(join),并将匹配到的行更新为TRUE:
# Start with every row flagged FALSE, then update-join against the first row
# of each Letter (unique() keeps the first occurrence per `by` group): every
# row sharing that Letter/Year/Month is switched to TRUE. The trailing []
# prints the result.
dt[, v := FALSE]
dt[
  unique(dt, by = "Letter"),
  v := TRUE,
  on = c("Letter", "Year", "Month")
][]
Letter Year Month v
1: E 2018 5 TRUE
2: A 2018 5 TRUE
3: B 2018 5 TRUE
4: A 2018 5 TRUE
5: B 2018 5 TRUE
6: A 2018 6 FALSE
7: B 2018 6 FALSE
8: C 2018 6 TRUE
9: D 2018 6 TRUE
10: C 2018 7 FALSE
11: D 2018 7 FALSE
12: E 2018 7 FALSE
13: F 2018 7 TRUE
答案 1 :(得分:3)
简单地:
# dt[,new := ifelse(Letter %in% dt$Letter[dt$Month<Month],F,T), by="Month"][]
# Letter Year Month new
#1: E 2018 5 TRUE
#2: A 2018 5 TRUE
#3: B 2018 5 TRUE
#4: A 2018 5 TRUE
#5: B 2018 5 TRUE
#6: A 2018 6 FALSE
#7: B 2018 6 FALSE
#8: C 2018 6 TRUE
#9: D 2018 6 TRUE
#10: C 2018 7 FALSE
#11: D 2018 7 FALSE
#12: E 2018 7 FALSE
#13: F 2018 7 TRUE
根据David A.非常中肯的评论,下面是一个更快、更简洁的版本(推荐):
# Flag rows whose Letter has not occurred in any strictly earlier
# (Year, Month) period. Grouping by both Year and Month makes each of them a
# length-1 value inside `j`, so comparing them against the full `dt$Year` /
# `dt$Month` columns selects every row from an earlier period. Comparing
# Month alone would ignore Year and treat the first months of every year as
# having no history. The trailing [] prints the result.
dt[, new := !(Letter %in% dt$Letter[
  dt$Year < Year | (dt$Year == Year & dt$Month < Month)
]), by = .(Year, Month)][]
答案 2 :(得分:2)
另一种可能的方法:
# `dupes` collects the Letters that have already been flagged as new.
# For each (Year, Month) group: a row is new when its Letter is not yet in
# `dupes`; the newly seen Letters are then appended to `dupes`.
# NOTE(review): this relies on the assignment to `dupes` inside `j` staying
# visible to later groups of the same call - confirm against data.table docs.
dupes <- NULL
dt[, New := {
  is_new <- !(Letter %chin% dupes)
  dupes <- c(dupes, unique(Letter[is_new]))
  is_new
}, by = c("Year", "Month")]
一些参考时间如下:
如果Letter是整数:
library(microbenchmark)
# Integer-Letter benchmark: each approach is timed three times on its own
# table (dt0, dt1, dt2).
#   mtd0 - per-group %in% lookup against rows with a smaller Month
#          (groups by Month only)
#   mtd1 - reset the flag, then update-join on the first row of each Letter
#   mtd2 - running vector `dupes` of Letters already flagged as new
microbenchmark(
  mtd0 = dt0[
    , New := !(Letter %in% dt0$Letter[dt0$Month < Month]), by = Month
  ],
  mtd1 = {
    dt1[, v := FALSE]
    dt1[unique(dt1, by = "Letter"), on = .(Letter, Year, Month), v := TRUE]
  },
  mtd2 = {
    dupes <- c()
    dt2[, New := {
      is_new <- !Letter %in% dupes
      dupes <- c(dupes, unique(Letter[is_new]))
      is_new
    }, by = .(Year, Month)]
  },
  times = 3L
)
整数定时输出:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0 1293.3100 1318.775 1331.7129 1344.2398 1350.9143 1357.589 3
mtd1 377.1534 391.178 402.4423 405.2026 415.0868 424.971 3
mtd2 2015.2115 2020.926 2023.7209 2026.6400 2027.9756 2029.311 3
如果Letter是字符:
# Character-Letter benchmark: same three approaches, using %chin% for the
# membership tests in mtd0 and mtd2. Each approach is timed three times on
# its own table (dt0, dt1, dt2).
#   mtd0 - per-group %chin% lookup against rows with a smaller Month
#          (groups by Month only)
#   mtd1 - reset the flag, then update-join on the first row of each Letter
#   mtd2 - running vector `dupes` of Letters already flagged as new
microbenchmark(
  mtd0 = dt0[
    , New := !(Letter %chin% dt0$Letter[dt0$Month < Month]), by = Month
  ],
  mtd1 = {
    dt1[, v := FALSE]
    dt1[unique(dt1, by = "Letter"), on = .(Letter, Year, Month), v := TRUE]
  },
  mtd2 = {
    dupes <- c()
    dt2[, New := {
      is_new <- !Letter %chin% dupes
      dupes <- c(dupes, unique(Letter[is_new]))
      is_new
    }, by = .(Year, Month)]
  },
  times = 3L
)
定时输出:
Unit: milliseconds
expr min lq mean median uq max neval
mtd0 1658.5806 1689.8941 1765.9329 1721.2076 1819.6090 1918.0105 3
mtd1 849.2361 851.1807 852.8632 853.1253 854.6768 856.2283 3
mtd2 420.1013 426.0941 433.9202 432.0869 440.8296 449.5723 3
检查(mtd0只按Month分组、未考虑Year,因此在跨多个年份的数据上与另外两种方法的结果不一致):
> identical(dt2$New, dt1$v)
[1] TRUE
> identical(dt0$New, dt1$v)
[1] FALSE
数据:
# Benchmark data: draw 1e7 random (Letter, Year, Month) triples with a fixed
# seed, drop duplicate rows, and sort by Year then Month.
set.seed(0L)
nr <- 1e7
dt <- unique(
  data.table(
    Letter = sample(nr / 1e2, nr, replace = TRUE),
    Year = sample(2014:2018, nr, replace = TRUE),
    Month = sample(1:12, nr, replace = TRUE)
  )
)
# For the character-Letter benchmark, chain this onto setorder():
#   [, Letter := as.character(Letter)]
setorder(dt, Year, Month)

# One separate copy per benchmarked method, since := modifies by reference.
dt0 <- copy(dt)
dt1 <- copy(dt)
dt2 <- copy(dt)
# With seed 0L, dt ends up with about 4.8 million rows.