R + data.table:基于3列的组合对数据集取子集,并仅保留每个组合的第一行

时间:2015-12-05 07:05:20

标签: r data.table

我对该主题有一个简短的跟进问题: R subsetting dataframe based on the combination of 3 columns and excluding duplicate combinations

我有这样的数据集:

Experiment  Sequence    Parameter   Time
Exp1        JJJJ        2           10     *
Exp1        JJJJ        2           11     *
Exp1        JJJJ        2           12     *
Exp2        JJJJ        2           13
Exp3        JJJJ        2           15
Exp1        ZZZZ        3           12    
Exp2        ZZZZ        3           23     *
Exp2        ZZZZ        3           23.5   *
Exp2        ZZZZ        3           24     *
Exp3        ZZZZ        3           15
.....

现在,对于每个序列,我只想保留每个唯一的 Experiment + Sequence + Parameter 组合的第一行。也就是说,把标有 * 的重复行缩减为一行,最终得到如下数据集:

Experiment  Sequence    Parameter   Time
Exp1        JJJJ        2           10     *
Exp2        JJJJ        2           13    
Exp3        JJJJ        2           15
Exp1        ZZZZ        3           12
Exp2        ZZZZ        3           23     *
Exp3        ZZZZ        3           15
.....

我想使用 data.table 包(它真的很棒),并想出了下面这个解决方案。但它运行需要相当长的时间,所以我想知道是否有更好、更快的方法或语法。

# Key the table on the three grouping columns, then keep the first row of
# every key group.
keycols <- c("Sequence", "Parameter", "Experiment")
setkeyv(DT, keycols)
DT <- DT[, head(.SD, 1), by = key(DT)]

2 个答案:

答案 0 :(得分:0)

下面将上述两条评论中提出的两种方法的速度与以下方法进行比较。

# Keyed-join alternative: key DT on the grouping columns, build the table of
# distinct key combinations, and join it back keeping only the first match
# per combination.
# The lookup columns are listed in the same order as the key so that the join
# lines up with DT's key columns even when it is resolved by column position.
setkey(DT, Sequence, Parameter, Experiment)
DT[unique(DT[, .(Sequence, Parameter, Experiment)]), mult = "first"]

首先,我生成一个包含5000万行的data.table。

# Build a 50-million-row benchmark table.
# The columns are created with data.table() rather than cbind(): cbind() on a
# mix of integer and character vectors produces a character matrix, which
# turned Experiment and Parameter into strings. Sequence is now sampled once
# per row instead of drawing 10 values and recycling them.
# NOTE(review): timings recorded with the previous all-character table may
# not match timings obtained on this table.
n_rows <- 50000000L
Sequ <- c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE",
          "FFFF", "GGGG", "HHHH", "IIII", "JJJJ")
DT <- data.table(
  Experiment = sample(1:2000, n_rows, replace = TRUE),
  Parameter = sample(1:9, n_rows, replace = TRUE),
  Sequence = sample(Sequ, n_rows, replace = TRUE)
)

DT[, Time := sample(1:60, .N, replace = TRUE)]
setkey(DT, NULL) # make sure the table starts without a key

# Time three ways of keeping the first row per
# (Sequence, Parameter, Experiment) group, using wall-clock timestamps.

# Approach 1: head() on each group's subset.
start1 <- Sys.time()
DT[, head(.SD, 1), by = .(Sequence, Parameter, Experiment)]
end1 <- Sys.time()

# Approach 2: index the first row of each group's subset.
start2 <- Sys.time()
DT[, .SD[1], by = .(Sequence, Parameter, Experiment)]
end2 <- Sys.time()

# Approach 3: keyed join against the distinct key combinations. setkey() is
# inside the timed region, so the sort is included in Time 3. The lookup
# columns are listed in key order so the join matches DT's key columns even
# when it is resolved by position.
start3 <- Sys.time()
setkey(DT, Sequence, Parameter, Experiment)
DT[unique(DT[, .(Sequence, Parameter, Experiment)]), mult = "first"]
end3 <- Sys.time()

# Report in explicit seconds: subtracting two timestamps lets difftime choose
# its units automatically and pasting the result drops them, which makes the
# three numbers impossible to compare reliably.
t1 <- difftime(end1, start1, units = "secs")
paste0("Time 1 = ", as.numeric(t1), " ", units(t1))

[1] "Time 1 = 5.45559597015381"

# Elapsed time of approach 2 in explicit seconds. Subtracting the timestamps
# directly lets difftime pick its units automatically, and pasting the result
# drops them.
t2 <- difftime(end2, start2, units = "secs")
paste0("Time 2 = ", as.numeric(t2), " ", units(t2))

[1] "Time 2 = 3.54731583595276"

# Elapsed time of approach 3 in explicit seconds. Subtracting the timestamps
# directly lets difftime pick its units automatically, and pasting the result
# drops them.
t3 <- difftime(end3, start3, units = "secs")
paste0("Time 3 = ", as.numeric(t3), " ", units(t3))

[1] "Time 3 = 10.0164358615875"

答案 1 :(得分:0)

TL; DR

如果你使用的是 data.table 1.9.6,请使用:

# Keep only the first row of every (Sequence, Parameter, Experiment) group by
# indexing row 1 of each group's subset.
DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]

如果您使用的是1.9.7+,也可以使用

# Keep only the first row of every (Sequence, Parameter, Experiment) group,
# expressed with head() on each group's subset.
DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]

一些基准

下面是 1e6 和 1e7 行数据的基准测试。由于最近引入了对 head(.SD, 1) 的优化,我还测试了 data.table 的开发版本。我使用了 @HywelMJ 生成的数据集,但它似乎没有像 OP 的数据那样按 Time 列排序,因此数据是无序的;一旦 OP 提供可重现的示例,我可以更新计时结果。由于 HywelMJ 的答案中使用了排序和 mult = "first",其结果与其他方法不同。根据 OP 打印出来的数据,我假设 Jaap 和 nicola 的答案是正确的。

# 1e6 - data.table 1.9.6 ----
# Benchmark of three ways to keep the first row per
# (Sequence, Parameter, Experiment) group on 1e6 rows. The commented numbers
# are the output recorded when the benchmark was originally run.

# install.packages("data.table")
packageVersion("data.table")
#[1] ‘1.9.6’
library(data.table)
DT <- as.data.table(cbind(
  Experiment = round(runif(1000000, min = 1, max = 2000)),
  Parameter = round(runif(1000000, min = 1, max = 9))
))
# Sequence is a running counter within each Experiment.
DT[, Sequence := seq_len(.N), keyby = Experiment]
DT[, Time := sample(1:60, .N, replace = TRUE)]
DT.backup <- DT[sample(nrow(DT))] # ensure random order

# Every run starts from a fresh copy of the shuffled table, because setkey()
# in the join run reorders its input by reference.
DT <- copy(DT.backup)
system.time(
  r.head <- DT[, head(.SD, 1L), by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  8.420   0.000   8.408 

DT <- copy(DT.backup)
system.time(
  r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  0.664   0.000   0.664 

# Keyed join; setkey() is timed as part of this approach. The lookup columns
# are listed in key order so the join matches DT's key columns even when it
# is resolved by column position.
DT <- copy(DT.backup)
system.time({
  setkey(DT, Sequence, Parameter, Experiment)
  r.join <- DT[unique(DT[, .(Sequence, Parameter, Experiment)]), mult = "first"]
})
#    user  system elapsed 
#   0.332   0.000   0.331

# Cross-check the results of the three approaches.
all.equal(r.head, r.sd1)
#[1] TRUE
all.equal(
  r.head[order(Sequence, Parameter, Experiment), .(Sequence, Parameter, Experiment, Time)],
  r.join[order(Sequence, Parameter, Experiment), .(Sequence, Parameter, Experiment, Time)]
)
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# NOTE(review): the recorded message concerns the objects' attributes; on its
# own it does not show that the row values differ -- verify before concluding.

# 1e7 - data.table 1.9.6 ----
# Benchmark of three ways to keep the first row per
# (Sequence, Parameter, Experiment) group on 1e7 rows. The commented numbers
# are the output recorded when the benchmark was originally run.

DT <- as.data.table(cbind(
  Experiment = round(runif(1e7, min = 1, max = 2000)),
  Parameter = round(runif(1e7, min = 1, max = 9))
))
# Sequence is a running counter within each Experiment.
DT[, Sequence := seq_len(.N), keyby = Experiment]
DT[, Time := sample(1:60, .N, replace = TRUE)]
DT.backup <- DT[sample(nrow(DT))] # ensure random order

# Every run starts from a fresh copy of the shuffled table, because setkey()
# in the join run reorders its input by reference.
DT <- copy(DT.backup)
system.time(
  r.head <- DT[, head(.SD, 1L), by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  85.848   0.064  85.829

DT <- copy(DT.backup)
system.time(
  r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  7.164   0.044   7.201

# Keyed join; setkey() is timed as part of this approach. The lookup columns
# are listed in key order so the join matches DT's key columns even when it
# is resolved by column position.
DT <- copy(DT.backup)
system.time({
  setkey(DT, Sequence, Parameter, Experiment)
  r.join <- DT[unique(DT[, .(Sequence, Parameter, Experiment)]), mult = "first"]
})
#    user  system elapsed 
#   3.440   0.080   3.516

# Cross-check the results of the three approaches.
all.equal(r.head, r.sd1)
#[1] TRUE
all.equal(
  r.head[order(Sequence, Parameter, Experiment, Time), .(Sequence, Parameter, Experiment, Time)],
  r.join[order(Sequence, Parameter, Experiment, Time), .(Sequence, Parameter, Experiment, Time)]
)
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# NOTE(review): the recorded message concerns the objects' attributes; on its
# own it does not show that the row values differ -- verify before concluding.

# 1e6 - data.table 1.9.7 ----
# Benchmark of three ways to keep the first row per
# (Sequence, Parameter, Experiment) group on 1e6 rows, run against the
# development version. The commented numbers are the output recorded when the
# benchmark was originally run.

# devtools::install_github("Rdatatable/data.table")
packageVersion("data.table")
#[1] ‘1.9.7’
library(data.table)

DT <- as.data.table(cbind(
  Experiment = round(runif(1000000, min = 1, max = 2000)),
  Parameter = round(runif(1000000, min = 1, max = 9))
))
# Sequence is a running counter within each Experiment.
DT[, Sequence := seq_len(.N), keyby = Experiment]
DT[, Time := sample(1:60, .N, replace = TRUE)]
DT.backup <- DT[sample(nrow(DT))] # ensure random order

# Every run starts from a fresh copy of the shuffled table, because setkey()
# in the join run reorders its input by reference.
DT <- copy(DT.backup)
system.time(
  r.head <- DT[, head(.SD, 1L), by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  0.236   0.008   0.242

DT <- copy(DT.backup)
system.time(
  r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  0.216   0.004   0.220 

# Keyed join; setkey() is timed as part of this approach. The lookup columns
# are listed in key order so the join matches DT's key columns even when it
# is resolved by column position.
DT <- copy(DT.backup)
system.time({
  setkey(DT, Sequence, Parameter, Experiment)
  r.join <- DT[unique(DT[, .(Sequence, Parameter, Experiment)]), mult = "first"]
})
#    user  system elapsed 
#   0.324   0.000   0.324 

# Cross-check the results of the three approaches.
all.equal(r.head, r.sd1)
#[1] TRUE
all.equal(
  r.head[order(Sequence, Parameter, Experiment, Time), .(Sequence, Parameter, Experiment, Time)],
  r.join[order(Sequence, Parameter, Experiment, Time), .(Sequence, Parameter, Experiment, Time)]
)
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# NOTE(review): the recorded message concerns the objects' attributes; on its
# own it does not show that the row values differ -- verify before concluding.

# 1e7 - data.table 1.9.7 ----
# Benchmark of three ways to keep the first row per
# (Sequence, Parameter, Experiment) group on 1e7 rows, run against the
# development version. The commented numbers are the output recorded when the
# benchmark was originally run.

DT <- as.data.table(cbind(
  Experiment = round(runif(1e7, min = 1, max = 2000)),
  Parameter = round(runif(1e7, min = 1, max = 9))
))
# Sequence is a running counter within each Experiment.
DT[, Sequence := seq_len(.N), keyby = Experiment]
DT[, Time := sample(1:60, .N, replace = TRUE)]
DT.backup <- DT[sample(nrow(DT))] # ensure random order

# Every run starts from a fresh copy of the shuffled table, because setkey()
# in the join run reorders its input by reference.
DT <- copy(DT.backup)
system.time(
  r.head <- DT[, head(.SD, 1L), by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  2.676   0.056   2.732

DT <- copy(DT.backup)
system.time(
  r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
#   user  system elapsed 
#  2.620   0.112   2.728

# Keyed join; setkey() is timed as part of this approach. The lookup columns
# are listed in key order so the join matches DT's key columns even when it
# is resolved by column position.
DT <- copy(DT.backup)
system.time({
  setkey(DT, Sequence, Parameter, Experiment)
  r.join <- DT[unique(DT[, .(Sequence, Parameter, Experiment)]), mult = "first"]
})
#    user  system elapsed 
#   3.636   0.084   3.714

# Cross-check the results of the three approaches.
all.equal(r.head, r.sd1)
#[1] TRUE
all.equal(
  r.head[order(Sequence, Parameter, Experiment, Time), .(Sequence, Parameter, Experiment, Time)],
  r.join[order(Sequence, Parameter, Experiment, Time), .(Sequence, Parameter, Experiment, Time)]
)
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# NOTE(review): the recorded message concerns the objects' attributes; on its
# own it does not show that the row values differ -- verify before concluding.