说说我的数据
mydata=structure(list(dt_l = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "06.10.2018 1:57", class = "factor"),
id = c(39474239L, 39474239L, 39474239L, 39474239L, 39474239L,
39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 39474239L,
39474239L, 39474239L, 39474239L, 39474239L, 39474239L, 39474239L
), date_appr = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "22.06.2019 20:05", class = "factor"),
date_call = structure(c(3L, 4L, 2L, 5L, 6L, 7L, 8L, 8L, 9L,
10L, 11L, 1L, 12L, 13L, 13L, 14L, 15L), .Label = c("03.11.2018 0:04",
"06.10.2018 19:41", "06.10.2018 2:01", "06.10.2018 2:02",
"06.10.2018 20:04", "06.10.2018 21:30", "06.10.2018 21:31",
"06.10.2018 22:48", "07.10.2018 22:08", "07.10.2018 22:09",
"09.10.2018 23:26", "11.03.2019 22:44", "22.06.2019 19:09",
"22.06.2019 19:40", "22.06.2019 19:45"), class = "factor"),
X = c(7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L,
7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L, 7954L
), goods = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Flekosteel", class = "factor"),
login = structure(c(6L, 6L, 2L, 4L, 9L, 9L, 5L, 5L, 3L, 3L,
7L, 1L, 8L, 11L, 11L, 10L, 11L), .Label = c("COLOP1AM-1135",
"COLOP1AM-722", "COLOP1AM-723", "COLOP1PM-1158", "COLOP1PM-1160",
"COLOP1PM-1200", "COLOP1PM-1210", "COLOP1PM-1212", "COLOP1PM-1287",
"COLOP1PM-1844", "COLOP1PM-1867"), class = "factor"), sex = structure(c(2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("female", "male"), class = "factor"), age = c(25L,
25L, 27L, 22L, 29L, 29L, 37L, 37L, 30L, 30L, 21L, 45L, 33L,
21L, 21L, 21L, 21L), country = structure(c(1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Colombia",
"Colombia"), class = "factor")), .Names = c("dt_l", "id", "date_appr",
"date_call", "X", "goods", "login", "sex", "age", "country"), class = "data.frame", row.names = c(NA, -17L))
如何为组id + goods + sex + country
设置标记变量
创建answer
变量,其中最后一行的所有行均应标记为0,而最后一行仅标记为1
dt_l id date_appr date_call X goods login
1 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200
2 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200
sex age country answer
1 male 25 Colombia 0
2 male 25 Colombia 1
如果组ID +货物+性别国家仅在0ne行,则应将其标记为“不正确的数据”
例如
24.08.2017 10:26 21409266 24.08.2017 16:39 \N 59de92f6d28f32d15fd2201911d27a2e Hammer of Thor \N \N \N \N incorrect data
该怎么做?
答案 0 :(得分:3)
因此,您要标记每个(id
,goods
,sex
,country
)组的最后一行吗?如果是这样,则可以检查行号向量.I
是否等于组last(.I)
中的最后一个行号。
library(data.table)
setDT(mydata)
mydata[, answer := as.integer(.I == last(.I)), .(id, goods, sex, country)][]
# dt_l id date_appr date_call X goods login sex age country answer
# 1: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200 male 25 Colombia 0
# 2: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200 male 25 Colombia 0
# 3: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 19:41 7954 Flekosteel COLOP1AM-722 female 27 Colombia 0
# 4: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 20:04 7954 Flekosteel COLOP1PM-1158 female 22 Colombia 1
# 5: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:30 7954 Flekosteel COLOP1PM-1287 female 29 Colombia 0
# 6: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:31 7954 Flekosteel COLOP1PM-1287 female 29 Colombia 0
# 7: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160 male 37 Colombia 0
# 8: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160 male 37 Colombia 0
# 9: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:08 7954 Flekosteel COLOP1AM-723 female 30 Colombia 0
# 10: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:09 7954 Flekosteel COLOP1AM-723 female 30 Colombia 0
# 11: 06.10.2018 1:57 39474239 22.06.2019 20:05 09.10.2018 23:26 7954 Flekosteel COLOP1PM-1210 female 21 Colombia 0
# 12: 06.10.2018 1:57 39474239 22.06.2019 20:05 03.11.2018 0:04 7954 Flekosteel COLOP1AM-1135 female 45 Colombia 0
# 13: 06.10.2018 1:57 39474239 22.06.2019 20:05 11.03.2019 22:44 7954 Flekosteel COLOP1PM-1212 female 33 Colombia 0
# 14: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female 21 Colombia 0
# 15: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female 21 Colombia 0
# 16: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:40 7954 Flekosteel COLOP1PM-1844 male 21 Colombia 1
# 17: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:45 7954 Flekosteel COLOP1PM-1867 female 21 Colombia 1
答案 1 :(得分:2)
另一种方式:
mydata[.N:1L, v := rowid(id, goods, sex, country) == 1L][]
dt_l id date_appr date_call X goods login sex age country v
1: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 2:01 7954 Flekosteel COLOP1PM-1200 male 25 Colombia FALSE
2: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 2:02 7954 Flekosteel COLOP1PM-1200 male 25 Colombia FALSE
3: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 19:41 7954 Flekosteel COLOP1AM-722 female 27 Colombia FALSE
4: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 20:04 7954 Flekosteel COLOP1PM-1158 female 22 Colombia TRUE
5: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:30 7954 Flekosteel COLOP1PM-1287 female 29 Colombia FALSE
6: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 21:31 7954 Flekosteel COLOP1PM-1287 female 29 Colombia FALSE
7: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160 male 37 Colombia FALSE
8: 06.10.2018 1:57 39474239 22.06.2019 20:05 06.10.2018 22:48 7954 Flekosteel COLOP1PM-1160 male 37 Colombia FALSE
9: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:08 7954 Flekosteel COLOP1AM-723 female 30 Colombia FALSE
10: 06.10.2018 1:57 39474239 22.06.2019 20:05 07.10.2018 22:09 7954 Flekosteel COLOP1AM-723 female 30 Colombia FALSE
11: 06.10.2018 1:57 39474239 22.06.2019 20:05 09.10.2018 23:26 7954 Flekosteel COLOP1PM-1210 female 21 Colombia FALSE
12: 06.10.2018 1:57 39474239 22.06.2019 20:05 03.11.2018 0:04 7954 Flekosteel COLOP1AM-1135 female 45 Colombia FALSE
13: 06.10.2018 1:57 39474239 22.06.2019 20:05 11.03.2019 22:44 7954 Flekosteel COLOP1PM-1212 female 33 Colombia FALSE
14: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female 21 Colombia FALSE
15: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:09 7954 Flekosteel COLOP1PM-1867 female 21 Colombia FALSE
16: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:40 7954 Flekosteel COLOP1PM-1844 male 21 Colombia TRUE
17: 06.10.2018 1:57 39474239 22.06.2019 20:05 22.06.2019 19:45 7954 Flekosteel COLOP1PM-1867 female 21 Colombia TRUE
行ID对一个组中的行进行计数-1,2,3,...-因此,将数据向后1:.N
进行排序,我们将找到每个组的最后一行。
答案 2 :(得分:2)
另一个受Frank rev命令和注释启发的选项:
setDT(mydata)[, v := !duplicated(.SD, fromLast=TRUE), .SDcols=c("id","goods","sex","country")]
计时代码:
set.seed(0L)
nr <- 1e6
mydata = data.table(id=sample(nr/2, nr, TRUE),
goods=sample(1:1e3, nr, TRUE),
sex=sample(c(0,1), nr, TRUE),
country=sample(1:200, nr, TRUE))
setorder(mydata, country, id, sex, goods)
DT1 <- copy(mydata)
DT2 <- copy(mydata)
DT3 <- copy(mydata)
bench::mark(
DT1[, v := .I == last(.I), .(id, goods, sex, country)],
DT2[.N:1L, v := rowid(id, goods, sex, country) == 1L],
DT3[, v := !duplicated(.SD, fromLast=TRUE), .SDcols=c("id","goods","sex","country")]
)
时间:
# A tibble: 3 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <lis> <list>
1 DT1[, `:=`(v, .I == last(.I~ 1.64s 1.64s 1.64s 1.64s 0.611 30.5MB 24 1 1.64s <data.tabl~ <Rprofm~ <bch~ <tibbl~
2 DT2[.N:1L, `:=`(v, rowid(id~ 110.46ms 117.1ms 115.8ms 129.99ms 8.54 45.8MB 0 5 585.5ms <data.tabl~ <Rprofm~ <bch~ <tibbl~
3 "DT3[, `:=`(v, !duplicated(~ 98.05ms 114.88ms 100.43ms 152.26ms 8.70 26.7MB 0 5 574.4ms <data.tabl~ <Rprofm~ <bch~ <tibbl~