在R中按条件创建其他变量

时间:2019-08-21 15:10:37

标签: r data.table

说说我的数据

mydata=structure(list(dt_l = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "06.10.2018 1:57", class = "factor"), 
    id = c(39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 
    39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 
    39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 39474239L
    ), date_appr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "22.06.2019 20:05", class = "factor"), 
    date_call = structure(c(3L, 4L, 2L, 5L, 6L, 7L, 8L, 8L, 9L, 
    10L, 11L, 1L, 12L, 13L, 13L, 14L, 15L), .Label = c("03.11.2018 0:04", 
    "06.10.2018 19:41", "06.10.2018 2:01", "06.10.2018 2:02", 
    "06.10.2018 20:04", "06.10.2018 21:30", "06.10.2018 21:31", 
    "06.10.2018 22:48", "07.10.2018 22:08", "07.10.2018 22:09", 
    "09.10.2018 23:26", "11.03.2019 22:44", "22.06.2019 19:09", 
    "22.06.2019 19:40", "22.06.2019 19:45"), class = "factor"), 
    X = c(7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 
    7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L
    ), goods = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Flekosteel", class = "factor"), 
    login = structure(c(6L, 6L, 2L, 4L, 9L, 9L, 5L, 5L, 3L, 3L, 
    7L, 1L, 8L, 11L, 11L, 10L, 11L), .Label = c("COLOP1AM-1135", 
    "COLOP1AM-722", "COLOP1AM-723", "COLOP1PM-1158", "COLOP1PM-1160", 
    "COLOP1PM-1200", "COLOP1PM-1210", "COLOP1PM-1212", "COLOP1PM-1287", 
    "COLOP1PM-1844", "COLOP1PM-1867"), class = "factor"), sex = structure(c(2L, 
    2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    1L), .Label = c("female", "male"), class = "factor"), age = c(25L, 
    25L, 27L, 22L, 29L, 29L, 37L, 37L, 30L, 30L, 21L, 45L, 33L, 
    21L, 21L, 21L, 21L), country = structure(c(1L, 1L, 1L, 2L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Colombia", 
    "Colombia"), class = "factor")), .Names = c("dt_l", "id", "date_appr", 
"date_call", "X", "goods", "login", "sex", "age", "country"), class = "data.frame", row.names = c(NA, -17L))

如何为组id + goods + sex + country设置标记变量

创建answer变量,其中最后一行的所有行均应标记为0,而最后一行仅标记为1

              dt_l       id        date_appr        date_call    X      goods         login
1  06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200
2  06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200

      sex age  country answer
1    male  25 Colombia      0
2    male  25 Colombia      1

如果组ID +货物+性别国家仅在0ne行,则应将其标记为“不正确的数据”

例如

24.08.2017 10:26    21409266    24.08.2017 16:39    \N  59de92f6d28f32d15fd2201911d27a2e    Hammer of Thor  \N  \N  \N  \N  incorrect data

该怎么做?

3 个答案:

答案 0 :(得分:3)

因此,您要标记每个(idgoodssexcountry)组的最后一行吗?如果是这样,则可以检查行号向量.I是否等于组last(.I)中的最后一个行号。

library(data.table)
setDT(mydata)

mydata[, answer := as.integer(.I == last(.I)), .(id, goods, sex, country)][]

#                dt_l       id        date_appr        date_call    X      goods         login    sex age  country answer
#  1: 06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200   male  25 Colombia      0
#  2: 06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200   male  25 Colombia      0
#  3: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 19:41 7954 Flekosteel  COLOP1AM-722 female  27 Colombia      0
#  4: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 20:04 7954 Flekosteel COLOP1PM-1158 female  22 Colombia      1
#  5: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:30 7954 Flekosteel COLOP1PM-1287 female  29 Colombia      0
#  6: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:31 7954 Flekosteel COLOP1PM-1287 female  29 Colombia      0
#  7: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160   male  37 Colombia      0
#  8: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160   male  37 Colombia      0
#  9: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:08 7954 Flekosteel  COLOP1AM-723 female  30 Colombia      0
# 10: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:09 7954 Flekosteel  COLOP1AM-723 female  30 Colombia      0
# 11: 06.10.2018 1:57 39474239 22.06.2019 20:05 09.10.2018 23:26 7954 Flekosteel COLOP1PM-1210 female  21 Colombia      0
# 12: 06.10.2018 1:57 39474239 22.06.2019 20:05  03.11.2018 0:04 7954 Flekosteel COLOP1AM-1135 female  45 Colombia      0
# 13: 06.10.2018 1:57 39474239 22.06.2019 20:05 11.03.2019 22:44 7954 Flekosteel COLOP1PM-1212 female  33 Colombia      0
# 14: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female  21 Colombia      0
# 15: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female  21 Colombia      0
# 16: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:40 7954 Flekosteel COLOP1PM-1844   male  21 Colombia      1
# 17: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:45 7954 Flekosteel COLOP1PM-1867 female  21 Colombia      1

答案 1 :(得分:2)

另一种方式:

mydata[.N:1L, v := rowid(id, goods, sex, country) == 1L][]

               dt_l       id        date_appr        date_call    X      goods         login    sex age  country     v
 1: 06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200   male  25 Colombia FALSE
 2: 06.10.2018 1:57 39474239 22.06.2019 20:05  06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200   male  25 Colombia FALSE
 3: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 19:41 7954 Flekosteel  COLOP1AM-722 female  27 Colombia FALSE
 4: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 20:04 7954 Flekosteel COLOP1PM-1158 female  22 Colombia  TRUE
 5: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:30 7954 Flekosteel COLOP1PM-1287 female  29 Colombia FALSE
 6: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:31 7954 Flekosteel COLOP1PM-1287 female  29 Colombia FALSE
 7: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160   male  37 Colombia FALSE
 8: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160   male  37 Colombia FALSE
 9: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:08 7954 Flekosteel  COLOP1AM-723 female  30 Colombia FALSE
10: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:09 7954 Flekosteel  COLOP1AM-723 female  30 Colombia FALSE
11: 06.10.2018 1:57 39474239 22.06.2019 20:05 09.10.2018 23:26 7954 Flekosteel COLOP1PM-1210 female  21 Colombia FALSE
12: 06.10.2018 1:57 39474239 22.06.2019 20:05  03.11.2018 0:04 7954 Flekosteel COLOP1AM-1135 female  45 Colombia FALSE
13: 06.10.2018 1:57 39474239 22.06.2019 20:05 11.03.2019 22:44 7954 Flekosteel COLOP1PM-1212 female  33 Colombia FALSE
14: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female  21 Colombia FALSE
15: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female  21 Colombia FALSE
16: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:40 7954 Flekosteel COLOP1PM-1844   male  21 Colombia  TRUE
17: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:45 7954 Flekosteel COLOP1PM-1867 female  21 Colombia  TRUE

行ID对一个组中的行进行计数-1,2,3,...-因此,将数据向后1:.N进行排序,我们将找到每个组的最后一行。

答案 2 :(得分:2)

另一个受Frank rev命令和注释启发的选项:

setDT(mydata)[, v := !duplicated(.SD, fromLast=TRUE), .SDcols=c("id","goods","sex","country")]

计时代码:

set.seed(0L)
nr <- 1e6
mydata = data.table(id=sample(nr/2, nr, TRUE), 
    goods=sample(1:1e3, nr, TRUE), 
    sex=sample(c(0,1), nr, TRUE), 
    country=sample(1:200, nr, TRUE))
setorder(mydata, country, id, sex, goods)
DT1 <- copy(mydata)
DT2 <- copy(mydata)
DT3 <- copy(mydata)

bench::mark(
    DT1[, v := .I == last(.I), .(id, goods, sex, country)],
    DT2[.N:1L, v := rowid(id, goods, sex, country) == 1L],
    DT3[, v := !duplicated(.SD, fromLast=TRUE), .SDcols=c("id","goods","sex","country")]
)

时间:

# A tibble: 3 x 14
  expression                        min     mean   median      max `itr/sec` mem_alloc  n_gc n_itr total_time result      memory   time  gc     
  <chr>                        <bch:tm> <bch:tm> <bch:tm> <bch:tm>     <dbl> <bch:byt> <dbl> <int>   <bch:tm> <list>      <list>   <lis> <list> 
1 DT1[, `:=`(v, .I == last(.I~    1.64s    1.64s    1.64s    1.64s     0.611    30.5MB    24     1      1.64s <data.tabl~ <Rprofm~ <bch~ <tibbl~
2 DT2[.N:1L, `:=`(v, rowid(id~ 110.46ms  117.1ms  115.8ms 129.99ms     8.54     45.8MB     0     5    585.5ms <data.tabl~ <Rprofm~ <bch~ <tibbl~
3 "DT3[, `:=`(v, !duplicated(~  98.05ms 114.88ms 100.43ms 152.26ms     8.70     26.7MB     0     5    574.4ms <data.tabl~ <Rprofm~ <bch~ <tibbl~