我有一个包含类似数据的表
Samp depth value
A1 0 2
A1 1 4
A1 2 3
A1 3 6
A1 4 8
A1 5 6
A1 6 2
A1 7 3
A2 0 2
A2 1 8
A2 2 6
A2 3 3
A2 4 6
A2 5 6
A3 0 7
A3 1 3
A3 2 2
A3 3 8
A3 4 3
...
带间隔的第二个表
Samp d_top d_bot
A1 0 2
A2 0 5
A3 1 2
A4 3 5
...
现在我想使用第二个表中的间隔来查询第一个表,并对每个间隔内的 value 求和。
Samp d_int sum_value
A1 0-2 9
A2 0-5 31
A3 1-2 5
...
这应该可以用 aggregate 或 ddply 通过指定列表来实现,我已经尝试过了。问题在于间隔不是固定的,而是由第二个表定义的。非常感谢任何帮助。
答案 0 :(得分:3)
这种“按另一个表分组”的操作可以借助 sqldf 包用 SQL 来表达。
在下面的代码中,您的数据表是 x,区间表是 y。诀窍在于使用两个连接条件:第一个写在 JOIN 条件中(按 Samp 匹配),第二个隐含在 WHERE 条件中(非等值连接)。
library(sqldf)
# Sum `value` per sample over the depth interval given in the range table.
# x is the data table (Samp, depth, value); y is the range table
# (Samp, d_top, d_bot).
# The equi-join on Samp pairs every data row with its interval; the WHERE
# clause then keeps only depths inside [d_top, d_bot] (a non-equi condition).
# The separator is written as '-' because SQL string literals take single
# quotes; double quotes denote identifiers.
sqldf("
  SELECT
    x.Samp,
    y.d_top || '-' || y.d_bot AS d_int,
    sum(x.value) AS sum_value
  FROM x
  JOIN y ON y.Samp = x.Samp
  WHERE
    y.d_top <= x.depth AND
    x.depth <= y.d_bot
  GROUP BY
    y.d_top, y.d_bot, x.Samp
")
其结果为:
Samp d_int sum_value
1 A1 0-2 9
2 A2 0-5 31
3 A3 1-2 5
答案 1 :(得分:2)
这是一种方法:
# Sum `value` per sample over the depth interval given in dat2.
# dat1 is the large table (Samp, depth, value); dat2 is the interval table
# (Samp, d_top, d_bot). Returns one row per sample with columns
# Samp, d_int ("top-bot") and sum_value.
do.call(rbind, by(dat1, dat1$Samp, function(x) {
  samp <- as.character(x$Samp[1])
  row <- match(samp, as.character(dat2$Samp))
  # A sample with no interval in dat2 is skipped (rbind ignores NULL)
  # instead of failing on a zero-length seq() call.
  if (is.na(row)) {
    return(NULL)
  }
  # range() orders the two bounds, so a reversed interval still works.
  bounds <- range(dat2$d_top[row], dat2$d_bot[row])
  # Compare against the bounds directly so non-integer depths are matched;
  # which() drops NA depths rather than propagating them into the sum.
  in_range <- which(x$depth >= bounds[1] & x$depth <= bounds[2])
  data.frame(Samp = samp,
             d_int = paste(bounds, collapse = "-"),
             sum_value = sum(x$value[in_range]))
}))
其中 dat1 是您较大的数据框,dat2 是您较小的数据框。
返回:
Samp d_int sum_value
A1 A1 0-2 9
A2 A2 0-5 31
A3 A3 1-2 5
答案 2 :(得分:2)
另一个想法:
# Sum DF1$value for sample `samp` over depths between `dt` and `db`
# (both bounds inclusive).
#   samp:   sample identifier matched against DF1$Samp
#   dt, db: interval bounds; either order is accepted
# Reads the global data frame DF1.
# DF1 and DF2 are your large and small dataframes, respectively.
f <- function(samp, dt, db) {
  rows <- DF1$Samp == samp
  depth <- DF1[rows, "depth"]
  # Compare against the bounds rather than `%in% dt:db`, which only matched
  # depths lying exactly on the unit-step sequence. which() drops NA depths.
  keep <- which(depth >= min(dt, db) & depth <= max(dt, db))
  sum(DF1[rows, "value"][keep])
}
# Build the result table: one row per interval in DF2, with the interval
# label and the sum returned by f() for that sample and interval.
data.frame(Samp = DF2$Samp,
           d_int = paste(DF2$d_top, DF2$d_bot, sep = " - "),
           # FALSE rather than F: F is an ordinary variable and can be reassigned.
           sum_value = mapply(f, DF2$Samp, DF2$d_top, DF2$d_bot,
                              USE.NAMES = FALSE))
# Samp d_int sum_value
#1 A1 0 - 2 9
#2 A2 0 - 5 31
#3 A3 1 - 2 5
基准测试:
# Benchmark fixtures: 26 samples (a-z) with 20 random depth/value rows each.
set.seed(11)
DF1 <- data.frame(Samp = rep(letters, each = 20),
                  depth = sample(1:10, 26 * 20, TRUE),
                  value = runif(26 * 20),
                  stringsAsFactors = FALSE)
# Re-seed so the interval table is reproducible on its own.
set.seed(11)
# Note: d_top is drawn from 1:5 and d_bot from 3:10, so d_top can exceed d_bot.
DF2 <- data.frame(Samp = letters,
                  d_top = sample(1:5, 26, TRUE),
                  d_bot = sample(3:10, 26, TRUE),
                  stringsAsFactors = FALSE)
# Aliases under the names each answer expects.
dat1 <- DF1
dat2 <- DF2
x <- DF1
y <- DF2
#> head(alex())
# Samp d_int sum_value
#1 a 2 - 6 5.127813
#2 b 1 - 3 4.043807
#3 c 3 - 4 3.356880
#4 d 1 - 6 9.209616
#5 e 1 - 7 7.452329
#6 f 5 - 5 2.241515
#> head(sven())
# Samp d_int sum_value
#a a 2-6 5.127813
#b b 1-3 4.043807
#c c 3-4 3.356880
#d d 1-6 9.209616
#e e 1-7 7.452329
#f f 5-5 2.241515
#> head(rick()[order(rick()[,1]),])
# Samp d_int sum_value
#10 a 2-6 5.127813
#1 b 1-3 4.043807
#16 c 3-4 3.356880
#4 d 1-6 9.209616
#6 e 1-7 7.452329
#22 f 5-5 2.241515
#> microbenchmark(alex(), sven(), rick())
#Unit: milliseconds
# expr min lq median uq max neval
# alex() 3.10070 3.230853 3.306196 3.461753 4.269292 100
# sven() 24.33163 25.525797 26.184391 26.868042 63.197223 100
# rick() 17.89463 18.622127 19.182584 19.820124 23.278920 100