出于速度方面的考虑,我正在尝试将数据操作代码从 dplyr 转换为 data.table。我已经基本完成了,但还差最后一步。
下面是一些可以重现我的问题的示例数据。
# Sample data: two users (u_id) with five records each; c_dt holds ISO-formatted
# date strings, so character comparison orders them chronologically.
c_dt <- data.table(
  u_id = rep(c("u1", "u2"), each = 5),
  p_id = c("p1", "p1", "p1", "p2", "p2", "p1", "p2", "p2", "p2", "p2"),
  c_dt = c(
    "2015-12-01", "2015-12-02", "2015-12-03", "2015-12-02", "2015-12-05",
    "2015-12-02", "2015-12-03", "2015-12-04", "2015-12-05", "2015-12-06"
  )
)
我希望找出 u_id 和 p_id 重复的行,并且只保留 c_dt 最小的那一行(基本上就是保留第一次出现的记录)。我使用以下 dplyr 代码:
# Convert to a plain data.frame for the dplyr version.
c_df <- as.data.frame(c_dt)
# Within each (p_id, u_id) group keep only the rows holding the smallest c_dt.
cdedup_df <- c_df %>%
  group_by(p_id, u_id) %>%
  filter(c_dt == min(c_dt))
它给出以下输出:
> cdedup_df
Source: local data frame [4 x 3]
Groups: p_id, u_id
u_id p_id c_dt
1 u1 p1 2015-12-01
2 u1 p2 2015-12-02
3 u2 p1 2015-12-02
4 u2 p2 2015-12-03
我有以下 data.table 代码,它可以正确标识出所需的行,但我不知道如何只筛选出这些行。
# Grouped j expression: for every (u_id, p_id) group this evaluates the logical
# test c_dt == min(c_dt), so the result holds one TRUE/FALSE flag (column V1)
# per input row instead of the matching rows themselves.
cdedup_dt <- c_dt[,c_dt == min(c_dt),by = list(u_id, p_id)]
cdedup_dt
u_id p_id V1
1: u1 p1 TRUE
2: u1 p1 FALSE
3: u1 p1 FALSE
4: u1 p2 TRUE
5: u1 p2 FALSE
6: u2 p1 TRUE
7: u2 p2 TRUE
8: u2 p2 FALSE
9: u2 p2 FALSE
10: u2 p2 FALSE
答案 0 :(得分:3)
类似下面这样的写法可以解决问题:
# One row per (u_id, p_id) group carrying the smallest c_dt of that group.
c_dt[, .(c_dt = min(c_dt)), by = .(u_id, p_id)]
## u_id p_id c_dt
## 1: u1 p1 2015-12-01
## 2: u1 p2 2015-12-02
## 3: u2 p1 2015-12-02
## 4: u2 p2 2015-12-03
答案 1 :(得分:3)
下面是我的方法。我预计它在大数据集上的扩展性会更好,因为它不需要按组计算 min,只需要做一次排序(data.table 在这方面非常高效),然后取每个组的第一行。
# Sort the table by the c_dt column, then take the first row of each
# (u_id, p_id) group, i.e. the row with the smallest c_dt in that group.
# NOTE(review): setorderv() is documented to reorder its input by reference, so
# the table c_dt itself remains sorted afterwards -- confirm that is acceptable.
setorderv(c_dt, "c_dt")[, .SD[1L], .(u_id, p_id)]
# in data.table 1.9.7+ you can also use `head`
setorderv(c_dt, "c_dt")[, head(.SD, 1L), .(u_id, p_id)]
以下代码还验证了当前其他答案的结果。如果提问者(OP)能提供大数据集,我可以补充基准测试。
library(data.table)

# Sample data from the question.
c_dt <- data.table(
  u_id = rep(c("u1", "u2"), each = 5),
  p_id = c("p1", "p1", "p1", "p2", "p2", "p1", "p2", "p2", "p2", "p2"),
  c_dt = c(
    "2015-12-01", "2015-12-02", "2015-12-03", "2015-12-02", "2015-12-05",
    "2015-12-02", "2015-12-03", "2015-12-04", "2015-12-05", "2015-12-06"
  )
)

# Solutions from the other answers, each computed on the unsorted table.
zero <- c_dt[, .(c_dt = min(c_dt)), by = .(u_id, p_id)]
ananda <- c_dt[, .(c_dt = c_dt[c_dt == min(c_dt)]), by = list(u_id, p_id)]
tal <- c_dt[
  , .SD[rank(c_dt, ties.method = "first") == 1],
  by = list(u_id, p_id)
]
all.equal(zero, ananda)
#[1] TRUE
all.equal(ananda, tal)
#[1] TRUE

# Sort-then-first-row approach; evaluated last because setorderv() reorders
# c_dt in place.
jan <- setorderv(c_dt, "c_dt")[, .SD[1L], by = .(u_id, p_id)]
all.equal(tal, jan)
#[1] TRUE
答案 2 :(得分:0)
你确实已经非常接近了。你缺少的只是在 j 参数中使用 .SD。让我们看看它是如何工作的:
library(data.table)

# Rebuild the sample table from the question.
c_dt <- data.table(
  u_id = rep(c("u1", "u2"), each = 5),
  p_id = c("p1", "p1", "p1", "p2", "p2", "p1", "p2", "p2", "p2", "p2"),
  c_dt = c(
    "2015-12-01", "2015-12-02", "2015-12-03", "2015-12-02", "2015-12-05",
    "2015-12-02", "2015-12-03", "2015-12-04", "2015-12-05", "2015-12-06"
  )
)
c_dt
u_id p_id c_dt
1: u1 p1 2015-12-01
2: u1 p1 2015-12-02
3: u1 p1 2015-12-03
4: u1 p2 2015-12-02
5: u1 p2 2015-12-05
6: u2 p1 2015-12-02
7: u2 p2 2015-12-03
8: u2 p2 2015-12-04
9: u2 p2 2015-12-05
10: u2 p2 2015-12-06
现在,我们按 u_id 和 p_id 进行分组,并按 c_dt 的最小值进行过滤:
# Group by (u_id, p_id) and, within each group's .SD, keep the rows whose c_dt
# equals the group minimum. Rows tied for the minimum are all kept.
cdedup_dt <- c_dt[ , .SD[c_dt == min(c_dt)], by = .(u_id, p_id)]
cdedup_dt
u_id p_id c_dt
1: u1 p1 2015-12-01
2: u1 p2 2015-12-02
3: u2 p1 2015-12-02
4: u2 p2 2015-12-03
请注意,.(u_id, p_id) 等价于 list(u_id, p_id),而 .SD 指的是每个组对应的 data.table 子集。你遗漏的只是 .SD。
正如 @zero323 所提到的,min 会保留重复项(也就是说,当组内有多行并列取得最小值时,结果中会出现重复的行)。如果你希望每个组只保留一条记录,更安全的做法是使用 rank 函数:
# Same grouping, but rank(..., ties.method = "first") == 1 selects exactly one
# row per group even when several rows share the minimum c_dt.
cdedup_dt <- c_dt[, .SD[rank(c_dt, ties.method = c("first")) == 1],by = .(u_id, p_id)]
cdedup_dt
u_id p_id c_dt
1: u1 p1 2015-12-01
2: u1 p2 2015-12-02
3: u2 p1 2015-12-02
4: u2 p2 2015-12-03