在 R 中将最早记录之后的重复行标记出来

时间:2017-04-19 14:50:07

标签: r data.table lubridate

我有一个包含重复记录的数据集,重复记录可以通过分组键(groupA 和 groupB)来识别。我希望把每组中最早的记录(按日期;如果日期相同,则按行号取第一行)之后的所有记录都标记为重复。

library(data.table)
library(lubridate)

# Sample data: the pair (groupA, groupB) is the key that identifies a group.
groupA <- c("A", "B", "C", "A", "B", "C", "D", "E", "A")
groupB <- c("y", "n", "n", "y", "y", "n", "y", "n", "y")
# Dates as year-month-day strings; parsed into Date values by ymd() below.
date <- c(
  "2017-04-01", "2017-02-01", "2017-03-01",
  "2017-01-01", "2017-05-01", "2017-03-01",
  "2017-07-01", "2017-08-01", "2017-09-01"
)

# Columns that together define a duplicate group.
keycols <- c("groupA", "groupB")

mydata <- data.table(groupA, groupB, date = ymd(date))

# Count rows per key and keep only the keys that occur more than once.
check.dups <- mydata[, .(count = .N), by = keycols][count > 1L]

# Key both tables on the grouping columns; setkeyv() sorts them by reference.
setkeyv(mydata, keycols)
setkeyv(check.dups, keycols)

我卡在了这一步:不知道该如何写出选取最早日期/第一行之后那些行的逻辑,以便创建重复标志。

# Flag every row after the earliest one within each (groupA, groupB) group.
#
# The original statement was incomplete (`:=` had no right-hand side) and was
# applied to the temporary result of `mydata[check.dups, ]`, so it could never
# have modified `mydata`; `min(date)` there was also the overall minimum, not
# the per-group one.
#
# Here the rows are visited in date order (the sort is stable, so rows that
# share a date keep their current relative order), and within each group the
# first row gets NA while all later rows get "y". Groups with a single row
# therefore end up with NA only. The assignment is done by reference, so
# `mydata` gains the `dup.flag` column and its row order is left unchanged.
mydata[
  order(date),
  dup.flag := c(NA_character_, rep("y", .N - 1L)),
  by = keycols
]

任何帮助非常感谢。

预期产出:

A 因日期较晚而被标记;C 的两行日期相同,因此按行号(row.id)标记靠后的那一行。

 groupA groupB date       dup.flag
 A      y      2017-04-01  y
 B      n      2017-02-01  NA
 C      n      2017-03-01  NA
 A      y      2017-01-01  NA
 B      y      2017-05-01  NA
 C      n      2017-03-01  y
 D      y      2017-07-01  NA
 E      n      2017-08-01  NA
 A      y      2017-09-01  y

1 个答案:

答案 0 :(得分:2)

请尝试 data.table 包中的 duplicated() 函数:

# Sort by the group key and then by date, so that the earliest record of each
# (groupA, groupB) group comes first. setkeyv() reorders mydata by reference.
setkeyv(mydata, c("groupA", "groupB", "date"))

# duplicated() with `by` returns TRUE for every row whose key already appeared
# in an earlier row, i.e. for everything after the first row of each group.
mydata[, `:=`(dup = duplicated(mydata, by = c("groupA", "groupB")))]

mydata
#   groupA groupB       date   dup
#1:      A      y 2017-01-01 FALSE
#2:      A      y 2017-04-01  TRUE
#3:      A      y 2017-09-01  TRUE
#4:      B      n 2017-02-01 FALSE
#5:      B      y 2017-05-01 FALSE
#6:      C      n 2017-03-01 FALSE
#7:      C      n 2017-03-01  TRUE
#8:      D      y 2017-07-01 FALSE
#9:      E      n 2017-08-01 FALSE