我对该主题有一个简短的跟进问题: R subsetting dataframe based on the combination of 3 columns and excluding duplicate combinations
我有这样的数据集:
Experiment Sequence Parameter Time
Exp1 JJJJ 2 10 *
Exp1 JJJJ 2 11 *
Exp1 JJJJ 2 12 *
Exp2 JJJJ 2 13
Exp3 JJJJ 2 15
Exp1 ZZZZ 3 12
Exp2 ZZZZ 3 23 *
Exp2 ZZZZ 3 23.5 *
Exp2 ZZZZ 3 24 *
Exp3 ZZZZ 3 15
.....
现在,对于每个序列,我只想保留每个唯一的Experiment + Sequence + Parameter组合的第一行。 也就是说,把标有*的重复行压缩为其中的第一行,最终得到如下数据集:
Experiment Sequence Parameter Time
Exp1 JJJJ 2 10 *
Exp2 JJJJ 2 13
Exp3 JJJJ 2 15
Exp1 ZZZZ 3 12
Exp2 ZZZZ 3 23 *
Exp3 ZZZZ 3 15
.....
我想使用data.table包,这真是太棒了,我想出了这个解决方案。这需要相当长的时间,所以我想知道是否有更好/最快的方式/语法。
# Columns that together identify one Experiment/Sequence/Parameter combination.
keycols <- c("Sequence", "Parameter", "Experiment")
# Key the table on those columns.
setkeyv(DT, keycols)
# Keep the first row of every key group.
DT <- DT[, head(.SD, 1), by = key(DT)]
答案 0 :(得分:0)
下面将上述两条评论中提出的两种方法的速度与以下方法进行比较。
# Key DT on the three grouping columns, then join it against its own distinct
# (Experiment, Sequence, Parameter) combinations; mult = "first" keeps only the
# first matching row of DT for each combination.
# NOTE(review): no `on =` is given and the inner table lists its columns in a
# different order than the key -- confirm the join pairs the intended columns.
setkey(DT,Sequence,Parameter,Experiment)
DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
首先,我生成一个包含5000万行的data.table。
# Build the 50-million-row benchmark table.
#
# The columns are created directly with data.table() instead of going through
# cbind(): cbind() on mixed integer/character vectors produces a character
# matrix, which would turn Experiment and Parameter into character columns.
n_rows <- 50000000L
Sequ <- c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE", "FFFF", "GGGG", "HHHH", "IIII", "JJJJ")
DT <- data.table(
  Experiment = sample(1:2000, n_rows, replace = TRUE),
  Parameter = sample(1:9, n_rows, replace = TRUE),
  # One label is drawn per row; drawing only 10 labels would silently recycle
  # the same 10-value pattern down the whole table.
  Sequence = sample(Sequ, n_rows, replace = TRUE)
)
DT[, Time := sample(1:60, .N, replace = TRUE)]
# Make sure the table carries no key before the timings start.
setkey(DT, NULL)
# Each approach is bracketed by two Sys.time() stamps; the results of the
# data.table calls are not stored, only the stamps are kept for later reporting.

# Approach 1: first row per group via head(.SD, 1).
start1 <- Sys.time()
DT[, head(.SD,1), by = .(Sequence, Parameter, Experiment)]
end1 <- Sys.time()
# Approach 2: first row per group via .SD[1].
start2 <- Sys.time()
DT[, .SD[1], by = .(Sequence, Parameter, Experiment)]
end2 <- Sys.time()
# Approach 3: key the table, then join against the distinct combinations with
# mult = "first". The setkey() call is inside the timed region.
start3 <- Sys.time()
setkey(DT,Sequence,Parameter,Experiment)
DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
end3 <- Sys.time()
# Elapsed time of approach 1: difference between its two Sys.time() stamps.
t1 <- end1 - start1
paste0("Time 1 = ", t1)
"Time 1 = 5.45559597015381"
# Elapsed time of approach 2: difference between its two Sys.time() stamps.
t2 <- end2 - start2
paste0("Time 2 = ", t2)
"Time 2 = 3.54731583595276"
# Elapsed time of approach 3: difference between its two Sys.time() stamps.
t3 <- end3 - start3
paste0("Time 3 = ", t3)
"Time 3 = 10.0164358615875"
答案 1 :(得分:0)
TL; DR
如果你使用的是data.table 1.9.6,请使用:
# First row of each (Sequence, Parameter, Experiment) group, taken by indexing .SD.
DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
如果您使用的是1.9.7+,也可以使用
# First row of each (Sequence, Parameter, Experiment) group, taken with head().
DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
一些基准
下面是在1e6和1e7行数据上的基准测试。由于最近引入了针对head(.SD, 1)
的优化,这里还测试了data.table的开发版本。
我使用了@HywelMJ生成的数据集,但它似乎没有像OP的数据集那样按Time
列排序,因此这里的数据是无序的。一旦OP提供可重现的示例,我可以更新这些计时结果。
由于HywelMJ的答案中使用了排序(setkey)和mult="first"
,其结果与其他答案的结果不同。根据OP打印出的数据,我认为Jaap和nicola的答案是正确的。
# 1e6 - data.table 1.9.6 ----
# Benchmark of three "first row per group" approaches on a 1e6-row table.
# install.packages("data.table")
packageVersion("data.table")
#[1] ‘1.9.6’
library(data.table)
# Experiment and Parameter are uniform draws rounded to whole numbers.
DT <- as.data.table(cbind(Experiment = round(runif(1000000,min = 1, max = 2000)),Parameter = round(runif(1000000,min = 1,max = 9))))
# Sequence is a running counter within each Experiment, so every
# (Experiment, Sequence) pair occurs exactly once in this table.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time is a random integer between 1 and 60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every timing below starts from a fresh copy of the shuffled table.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 8.420 0.000 8.408
DT = copy(DT.backup)
# Approach 2: .SD[1L] per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 0.664 0.000 0.664
DT = copy(DT.backup)
# Approach 3: setkey() plus a join on the distinct combinations with
# mult = "first"; the keying is included in the timed expression.
# NOTE(review): no `on =` is given and the inner table lists its columns in a
# different order than the key -- confirm the join pairs the intended columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 0.332 0.000 0.331
# Approaches 1 and 2 are compared directly.
all.equal(r.head, r.sd1)
#[1] TRUE
# Approach 3 is compared after sorting both results and aligning the columns.
# NOTE(review): the recorded message reports a difference in attributes; it
# does not show whether the row values differ as well.
all.equal(r.head[order(Sequence,Parameter,Experiment), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# 1e7 - data.table 1.9.6 ----
# Same three approaches, this time on a 1e7-row table.
# Experiment and Parameter are uniform draws rounded to whole numbers.
DT <- as.data.table(cbind(Experiment = round(runif(1e7,min = 1, max = 2000)),Parameter = round(runif(1e7,min = 1,max = 9))))
# Sequence is a running counter within each Experiment, so every
# (Experiment, Sequence) pair occurs exactly once in this table.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time is a random integer between 1 and 60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every timing below starts from a fresh copy of the shuffled table.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 85.848 0.064 85.829
DT = copy(DT.backup)
# Approach 2: .SD[1L] per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 7.164 0.044 7.201
DT = copy(DT.backup)
# Approach 3: setkey() plus a join on the distinct combinations with
# mult = "first"; the keying is included in the timed expression.
# NOTE(review): no `on =` is given and the inner table lists its columns in a
# different order than the key -- confirm the join pairs the intended columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 3.440 0.080 3.516
# Approaches 1 and 2 are compared directly.
all.equal(r.head, r.sd1)
#[1] TRUE
# Approach 3 is compared after sorting both results and aligning the columns.
# NOTE(review): the recorded message reports a difference in attributes; it
# does not show whether the row values differ as well.
all.equal(r.head[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# 1e6 - data.table 1.9.7 ----
# Same three approaches on a 1e6-row table, using the development version.
# devtools::install_github("Rdatatable/data.table")
packageVersion("data.table")
#[1] ‘1.9.7’
library(data.table)
# Experiment and Parameter are uniform draws rounded to whole numbers.
DT <- as.data.table(cbind(Experiment = round(runif(1000000,min = 1, max = 2000)),Parameter = round(runif(1000000,min = 1,max = 9))))
# Sequence is a running counter within each Experiment, so every
# (Experiment, Sequence) pair occurs exactly once in this table.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time is a random integer between 1 and 60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every timing below starts from a fresh copy of the shuffled table.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 0.236 0.008 0.242
DT = copy(DT.backup)
# Approach 2: .SD[1L] per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 0.216 0.004 0.220
DT = copy(DT.backup)
# Approach 3: setkey() plus a join on the distinct combinations with
# mult = "first"; the keying is included in the timed expression.
# NOTE(review): no `on =` is given and the inner table lists its columns in a
# different order than the key -- confirm the join pairs the intended columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 0.324 0.000 0.324
# Approaches 1 and 2 are compared directly.
all.equal(r.head, r.sd1)
#[1] TRUE
# Approach 3 is compared after sorting both results and aligning the columns.
# NOTE(review): the recorded message reports a difference in attributes; it
# does not show whether the row values differ as well.
all.equal(r.head[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"
# 1e7 - data.table 1.9.7 ----
# Same three approaches, this time on a 1e7-row table.
# Experiment and Parameter are uniform draws rounded to whole numbers.
DT <- as.data.table(cbind(Experiment = round(runif(1e7,min = 1, max = 2000)),Parameter = round(runif(1e7,min = 1,max = 9))))
# Sequence is a running counter within each Experiment, so every
# (Experiment, Sequence) pair occurs exactly once in this table.
DT[,Sequence:= seq_len(.N),keyby = Experiment]
# Time is a random integer between 1 and 60.
DT[,Time := sample(1:60,.N,replace = TRUE)]
DT.backup = DT[sample(nrow(DT))] # ensure random order
# Every timing below starts from a fresh copy of the shuffled table.
DT = copy(DT.backup)
# Approach 1: head(.SD, 1L) per group.
system.time(
r.head <- DT[, head(.SD,1L), by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 2.676 0.056 2.732
DT = copy(DT.backup)
# Approach 2: .SD[1L] per group.
system.time(
r.sd1 <- DT[, .SD[1L], by = .(Sequence, Parameter, Experiment)]
)
# user system elapsed
# 2.620 0.112 2.728
DT = copy(DT.backup)
# Approach 3: setkey() plus a join on the distinct combinations with
# mult = "first"; the keying is included in the timed expression.
# NOTE(review): no `on =` is given and the inner table lists its columns in a
# different order than the key -- confirm the join pairs the intended columns.
system.time({
setkey(DT,Sequence,Parameter,Experiment)
r.join <- DT[unique(DT[,.(Experiment,Sequence,Parameter)]),mult = "first"]
})
# user system elapsed
# 3.636 0.084 3.714
# Approaches 1 and 2 are compared directly.
all.equal(r.head, r.sd1)
#[1] TRUE
# Approach 3 is compared after sorting both results and aligning the columns.
# NOTE(review): the recorded message reports a difference in attributes; it
# does not show whether the row values differ as well.
all.equal(r.head[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)],
r.join[order(Sequence,Parameter,Experiment,Time), .(Sequence,Parameter,Experiment,Time)])
#[1] "Attributes: < Length mismatch: comparison on first 1 components >"