R中data.table中的组累积标识新值

时间:2018-09-20 10:55:57

标签: r dataframe dplyr data.table tidyverse

如何创建一个新列,以通过Letter + Year的唯一组合来累积标识Month列中新值的外观?

数据样本。

require(data.table)
dt <- data.table(Letter = c(LETTERS[c(5, 1:2, 1:2, 1:4, 3:6)]),
                 Year = 2018,
                 Month = c(rep(5,5), rep(6,4), rep(7,4)))

打印。

    Letter Year Month
 1:      E 2018     5
 2:      A 2018     5
 3:      B 2018     5
 4:      A 2018     5
 5:      B 2018     5
 6:      A 2018     6
 7:      B 2018     6
 8:      C 2018     6
 9:      D 2018     6
10:      C 2018     7
11:      D 2018     7
12:      E 2018     7
13:      F 2018     7

我想要得到的结果:

    Letter Year Month   New
 1:      E 2018     5  TRUE
 2:      A 2018     5  TRUE
 3:      B 2018     5  TRUE
 4:      A 2018     5  TRUE
 5:      B 2018     5  TRUE
 6:      A 2018     6 FALSE
 7:      B 2018     6 FALSE
 8:      C 2018     6  TRUE
 9:      D 2018     6  TRUE
10:      C 2018     7 FALSE
11:      D 2018     7 FALSE
12:      E 2018     7 FALSE
13:      F 2018     7  TRUE

详细问题:

  1. Group1(“ E”,“ A”,“ B”,“ A”,“ B”)默认情况下均为TRUE,无需进行比较。
  2. 第2组中的哪个字母(“ A”,“ B”,“ C”,“ D”)在第1组中没有重复。
  3. 然后,第1组和第2组(“ E”,“ A”,“ B”,“ A”)中第3组中的哪个字母(“ C”,“ D”,“ E”,“ F”)不重复,“ B”,“ A”,“ B”,“ C”,“ D”)。

3 个答案:

答案 0 :(得分:5)

初始化为FALSE;然后加入每个字母的第一个Year-Month并更新为TRUE:

dt[, v := FALSE]
dt[unique(dt, by="Letter"), on=.(Letter, Year, Month), v := TRUE][]

    Letter Year Month     v
 1:      E 2018     5  TRUE
 2:      A 2018     5  TRUE
 3:      B 2018     5  TRUE
 4:      A 2018     5  TRUE
 5:      B 2018     5  TRUE
 6:      A 2018     6 FALSE
 7:      B 2018     6 FALSE
 8:      C 2018     6  TRUE
 9:      D 2018     6  TRUE
10:      C 2018     7 FALSE
11:      D 2018     7 FALSE
12:      E 2018     7 FALSE
13:      F 2018     7  TRUE

答案 1 :(得分:3)

简单地:

 # dt[,new := ifelse(Letter %in% dt$Letter[dt$Month<Month],F,T), by="Month"][]

 #   Letter Year Month   new
 #1:      E 2018     5  TRUE
 #2:      A 2018     5  TRUE
 #3:      B 2018     5  TRUE
 #4:      A 2018     5  TRUE
 #5:      B 2018     5  TRUE
 #6:      A 2018     6 FALSE
 #7:      B 2018     6 FALSE
 #8:      C 2018     6  TRUE
 #9:      D 2018     6  TRUE
#10:      C 2018     7 FALSE
#11:      D 2018     7 FALSE
#12:      E 2018     7 FALSE
#13:      F 2018     7  TRUE

使用David A.的非常有效的评论,它是一个更快,更简洁的版本:(推荐

dt[, new := !(Letter %in% dt$Letter[dt$Month<Month]), by=Month][]

答案 2 :(得分:2)

另一种可能的方法:

dupes <- c()
dt[, New := {
    x <- !Letter %chin% dupes
    dupes <- c(dupes, unique(Letter[x]))
    x
}, by=.(Year, Month)]

一些参考时间如下:

如果Letter是整数:

library(microbenchmark)
microbenchmark(mtd0=dt0[, New := !(Letter %in% dt0$Letter[dt0$Month<Month]), by=Month],
    mtd1={
        dt1[, v := FALSE]
        dt1[unique(dt1, by="Letter"), on=.(Letter, Year, Month), v := TRUE]
    },
    mtd2={
        dupes <- c()
        dt2[, New := {
            x <- !Letter %in% dupes
            dupes <- c(dupes, unique(Letter[x]))
            x
        }, by=.(Year, Month)]        
    },
    times=3L)

整数定时输出:

Unit: milliseconds
 expr       min       lq      mean    median        uq      max neval
 mtd0 1293.3100 1318.775 1331.7129 1344.2398 1350.9143 1357.589     3
 mtd1  377.1534  391.178  402.4423  405.2026  415.0868  424.971     3
 mtd2 2015.2115 2020.926 2023.7209 2026.6400 2027.9756 2029.311     3

如果字母是字符:

microbenchmark(mtd0=dt0[, New := !(Letter %chin% dt0$Letter[dt0$Month<Month]), by=Month],
    mtd1={
        dt1[, v := FALSE]
        dt1[unique(dt1, by="Letter"), on=.(Letter, Year, Month), v := TRUE]
    },
    mtd2={
        dupes <- c()
        dt2[, New := {
            x <- !Letter %chin% dupes
            dupes <- c(dupes, unique(Letter[x]))
            x
        }, by=.(Year, Month)]        
    },
    times=3L)

定时输出:

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
 mtd0 1658.5806 1689.8941 1765.9329 1721.2076 1819.6090 1918.0105     3
 mtd1  849.2361  851.1807  852.8632  853.1253  854.6768  856.2283     3
 mtd2  420.1013  426.0941  433.9202  432.0869  440.8296  449.5723     3

检查:

> identical(dt2$New, dt1$v)
[1] TRUE
> identical(dt0$New, dt1$v)
[1] FALSE

数据:

set.seed(0L)
nr <- 1e7
dt <- unique(data.table(Letter=sample(nr/1e2, nr, replace=TRUE),
    Year=sample(2014:2018, nr, replace=TRUE),
    Month=sample(1:12, nr, replace=TRUE)))
setorder(dt, Year, Month)#[, Letter := as.character(Letter)]
dt0 <- copy(dt)
dt1 <- copy(dt)
dt2 <- copy(dt)

#for seed=0L, dt has about 4.8mio rows