R:使用来自第二个数据框的条件对 data.frame 进行聚合

时间:2014-01-04 12:50:02

标签: r dataframe aggregate-functions

我有一个包含类似数据的表

Samp  depth   value
A1     0     2
A1     1     4
A1     2     3
A1     3     6
A1     4     8
A1     5     6
A1     6     2
A1     7     3
A2     0     2
A2     1     8
A2     2     6
A2     3     3
A2     4     6
A2     5     6
A3     0     7
A3     1     3
A3     2     2
A3     3     8
A3     4     3
...

以及包含深度间隔的第二个表

Samp d_top d_bot
A1   0   2
A2   0   5
A3   1   2
A4   3   5
...

现在我想使用第二个表中的间隔来查询第一个表,并对每个间隔内的 value 求和,得到如下结果:

Samp  d_int sum_value
A1    0-2    9
A2    0-5   31
A3    1-2    5
...

这应该可以用 aggregate 或 ddply 通过指定分组列表来实现,我也尝试过。问题在于间隔不是固定的,而是由第二个表为每个样本分别定义的。非常感谢任何帮助。

3 个答案:

答案 0 :(得分:3)

这种“按另一张表分组”的操作可以借助 sqldf 包用 SQL 来表达。

在下面的代码中,您的数据表是 x,区间表是 y。诀窍在于使用两个连接条件:第一个写在 JOIN 子句中(按 Samp 的等值连接),第二个隐含在 WHERE 子句中(按深度范围的非等值连接)。

library(sqldf)
# Sum x.value per sample over the depth interval that y gives for that sample.
# Rows of x are paired with the y row of the same Samp (JOIN ... ON) and then
# kept only when depth lies inside the closed interval [d_top, d_bot] (WHERE).
# The SQL string literal '-' is single-quoted: SQLite (sqldf's default backend)
# resolves a double-quoted token as an identifier first and only falls back to
# a string literal when no such column exists.
sqldf("
  SELECT
    x.Samp,
    y.d_top || '-' || y.d_bot AS d_int,
    sum(x.value) AS sum_value
  FROM x
  JOIN y ON y.Samp = x.Samp
  WHERE
    y.d_top <= x.depth AND
    x.depth <= y.d_bot
  GROUP BY
    y.d_top, y.d_bot, x.Samp
  ")

得到的结果为:

    Samp d_int sum_value
1   A1   0-2         9
2   A2   0-5        31
3   A3   1-2         5

答案 1 :(得分:2)

这是一种方法:

# For every sample in dat1, look up its depth interval in dat2 and sum the
# values whose depth falls inside that interval; the per-sample one-row data
# frames are then stacked with rbind.
do.call(rbind, by(dat1, dat1$Samp, function(x) {
  samp <- as.character(x$Samp[1])
  hit <- match(samp, as.character(dat2$Samp))
  # A sample without an interval row in dat2 contributes no output row
  # (rbind drops NULL) instead of aborting the whole aggregation.
  if (is.na(hit)) {
    return(NULL)
  }
  # range() orders the two bounds, so the label reads "low-high" and the test
  # below is a closed interval that also accepts non-integer depths.
  bounds <- range(dat2$d_top[hit], dat2$d_bot[hit])
  # which() discards rows whose depth is NA rather than propagating NA.
  inside <- which(x$depth >= bounds[1] & x$depth <= bounds[2])
  data.frame(Samp = samp,
             d_int = paste(bounds, collapse = "-"),
             sum_value = sum(x$value[inside]))
}))

其中dat1是您较大的数据框,dat2是您较短的数据框。

返回:

   Samp d_int sum_value
A1   A1   0-2         9
A2   A2   0-5        31
A3   A3   1-2         5

答案 2 :(得分:2)

另一个想法:

# Sum the `value` column over the rows of `data` that belong to sample `samp`
# and whose `depth` lies in the closed interval spanned by `dt` and `db`.
#
# samp  sample identifier to match against data$Samp
# dt    one end of the depth interval (d_top)
# db    other end of the depth interval (d_bot)
# data  data frame with columns Samp, depth and value; defaults to DF1, the
#       large data frame (DF2 is the small interval data frame)
#
# Returns a single number; 0 when no row matches.
f <- function(samp, dt, db, data = DF1) {
  # which() drops rows where the comparison is NA instead of propagating NA.
  rows <- data[which(data$Samp == samp), , drop = FALSE]
  lo <- min(dt, db)
  hi <- max(dt, db)
  # A range comparison (rather than %in% dt:db) also counts non-integer depths.
  sum(rows$value[which(rows$depth >= lo & rows$depth <= hi)])
}

# Build the result table: one row per interval in DF2, with the interval
# rendered as "top - bottom" and the sum obtained by calling f once per row.
# USE.NAMES = FALSE keeps mapply from naming the sums after the sample ids;
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
data.frame(
  Samp = DF2$Samp,
  d_int = paste(DF2$d_top, DF2$d_bot, sep = " - "),
  sum_value = mapply(f, DF2$Samp, DF2$d_top, DF2$d_bot, USE.NAMES = FALSE)
)
#  Samp d_int sum_value
#1   A1 0 - 2         9
#2   A2 0 - 5        31
#3   A3 1 - 2         5

基准测试:

# Benchmark fixtures: 26 samples (one per lowercase letter) with 20 readings
# each, and one depth interval per sample. The seed is reset before each data
# frame is built. TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
set.seed(11)
DF1 <- data.frame(
  Samp = rep(letters, each = 20),
  depth = sample(1:10, 26 * 20, replace = TRUE),
  value = runif(26 * 20),
  stringsAsFactors = FALSE
)
set.seed(11)
DF2 <- data.frame(
  Samp = letters,
  d_top = sample(1:5, 26, replace = TRUE),
  d_bot = sample(3:10, 26, replace = TRUE),
  stringsAsFactors = FALSE
)
# Aliases under the names each answer's code expects.
dat1 <- DF1
dat2 <- DF2
x <- DF1
y <- DF2
#> head(alex())
#  Samp d_int sum_value
#1    a 2 - 6  5.127813
#2    b 1 - 3  4.043807
#3    c 3 - 4  3.356880
#4    d 1 - 6  9.209616
#5    e 1 - 7  7.452329
#6    f 5 - 5  2.241515
#> head(sven())
#  Samp d_int sum_value
#a    a   2-6  5.127813
#b    b   1-3  4.043807
#c    c   3-4  3.356880
#d    d   1-6  9.209616
#e    e   1-7  7.452329
#f    f   5-5  2.241515
#> head(rick()[order(rick()[,1]),])
#   Samp d_int sum_value
#10    a   2-6  5.127813
#1     b   1-3  4.043807
#16    c   3-4  3.356880
#4     d   1-6  9.209616
#6     e   1-7  7.452329
#22    f   5-5  2.241515

#> microbenchmark(alex(), sven(), rick())
#Unit: milliseconds
#   expr      min        lq    median        uq       max neval
# alex()  3.10070  3.230853  3.306196  3.461753  4.269292   100
# sven() 24.33163 25.525797 26.184391 26.868042 63.197223   100
# rick() 17.89463 18.622127 19.182584 19.820124 23.278920   100