
时间:2015-11-17 06:25:18

标签: r data.table


# Build a random example table with `n` rows.
#
# Args:
#   n: number of rows to generate (non-negative integer).
#
# Returns:
#   A data.table with columns `id` (1..n), `grp` (a letter drawn from A-C),
#   `wt` (integer drawn from 1-10) and `x` (integer drawn from 1-100),
#   with rows sorted by `grp`. Values are random; call set.seed() first if
#   repeatable output is needed.
ex <- function(n) {
  data.table(
    id = seq_len(n), # seq_len() gives an empty column for n = 0, unlike 1:n
    grp = sample(LETTERS[1:3], n, replace = TRUE),
    wt = sample.int(10, n, replace = TRUE),
    x = sample.int(100, n, replace = TRUE)
  )[order(grp), ]
}
# Draw a 10-row example and display it (a bare name auto-prints at top level).
d <- ex(10)
d
# Sample output (values are random, so they differ between runs):
#     id grp wt   x
#  1:  1   A 10  89
#  2:  6   A  9  71
#  3:  3   B  7  65
#  4:  7   B  3  55
#  5:  9   B  4  29
#  6: 10   B 10  15
#  7:  2   C  5  70
#  8:  4   C  6 100
#  9:  5   C  2  66
# 10:  8   C  1  60



# Self-join approach (dplyr::full_join + data.table): pair every row with all
# rows of its own group, drop the self-pairs, then compute the weighted
# average of the partner rows for each id.
#
# The j expression returns the per-pair columns grp/x/wt while grouping by id
# only, so each id is repeated once per partner row; unique() collapses those
# identical rows to one row per id. (The opening call was missing, which left
# the closing parenthesis below unmatched.)
# Rows whose group has no other member have only a self-pair and are dropped
# by the id.x != id.y filter.
d <- ex(10)
unique(
  subset(data.table(full_join(d, d, by = "grp")),
         id.x != id.y)[, .(grp, x = x.x, wt = wt.x,
                           rest_of_grp_wtd_avg = sum(wt.y * x.y) / sum(wt.y)),
                       by = .(id = id.x)][order(grp, id), ]
) # produces desired result
#    id grp   x wt rest_of_grp_wtd_avg
# 1:  1   A  89 10            71.00000
# 2:  6   A  71  9            89.00000
# 3:  3   B  65  7            25.35294
# 4:  7   B  55  3            34.33333
# 5:  9   B  29  4            38.50000
# 6: 10   B  15 10            52.57143
# 7:  2   C  70  5            88.00000
# 8:  4   C 100  6            67.75000
# 9:  5   C  66  2            84.16667
#10:  8   C  60  1            83.23077


另外,sqldf (编辑:现在)有效:

# The same self-join expressed in SQL through sqldf: each row of d (alias a)
# is matched with the other rows of its group (alias b) and aggregated per id.
# wt and x are integer columns, and SQLite (sqldf's default backend) performs
# integer division when both operands are integers, so the numerator is
# multiplied by 1.0 to obtain a floating-point average instead of a
# truncated one.
sqldf('select a.*, 
  sum(b.wt * b.x) * 1.0 / sum(b.wt) as rest_of_grp_wtd_avg
  from d as a
  left outer join d as b on a.grp = b.grp and a.id <> b.id
  group by a.id') # returns the desired solution


# Pure data.table self-join: CJ() builds every (id, id2) combination, the
# join attaches the row of d belonging to `id`, self-pairs are dropped, and
# merge() then attaches the row belonging to `id2`, keeping only pairs that
# share the same grp. The .x columns describe `id`, the .y columns its partner.
#
# `on = "id"` states the join column explicitly: a bare d[CJ(...)] joins on
# the key of d, and d as produced by ex() has no key set.
merge(d[CJ(id = d$id, id2 = d$id), on = "id"][id != id2],
      d, by.x = c("id2", "grp"), by.y = c("id", "grp")
      )[order(grp, id),
        .(rest_of_grp_wtd_avg = sum(wt.y * x.y) / sum(wt.y)),
        by = .(id, grp, wt = wt.x, x = x.x)] # returns desired result


2 个答案:

答案 0 :(得分:3)

我认为你把问题想得过于复杂了。使用您的公式 `rest_of_grp_wtd_avg = (sum(wt*x)-wt*x) / (sum(wt)-wt)`,就可以添加一个新变量,其值为每组中其余观测值的加权平均值。您只需要使用 `:=` 运算符通过引用将其添加到 `d` 中。对于纯 `data.table` 解决方案,您可以将代码缩短为:

# Add the column to d by reference: within each grp, subtract the row's own
# contribution from the group totals, then divide to get the weighted average
# of the remaining rows.
d[, rest_of_grp_wtd_avg := {
  contrib <- wt * x
  (sum(contrib) - contrib) / (sum(wt) - wt)
}, by = grp]


> d
    id grp wt   x rest_of_grp_wtd_avg
 1:  1   A 10  89            71.00000
 2:  6   A  9  71            89.00000
 3:  3   B  7  65            25.35294
 4:  7   B  3  55            34.33333
 5:  9   B  4  29            38.50000
 6: 10   B 10  15            52.57143
 7:  2   C  5  70            88.00000
 8:  4   C  6 100            67.75000
 9:  5   C  2  66            84.16667
10:  8   C  1  60            83.23077


> all.equal(d, res)
[1] TRUE



# Reference result computed with the self-join approach, stored in `res`.
# CJ() builds every (id, id2) combination, the join attaches the row of d for
# `id`, self-pairs are dropped, and merge() attaches the row for `id2` while
# keeping only pairs within the same grp (.x = the row itself, .y = partner).
#
# `on = "id"` states the join column explicitly: a bare d[CJ(...)] joins on
# the key of d, and d as produced by ex() has no key set.
res <- merge(d[CJ(id = d$id, id2 = d$id), on = "id"][id != id2],
             d, by.x = c("id2", "grp"), by.y = c("id", "grp")
             )[order(grp, id),
               .(rest_of_grp_wtd_avg = sum(wt.y * x.y) / sum(wt.y)),
               by = .(id, grp, wt = wt.x, x = x.x)]


# Same calculation restricted to rows with id < 9: group totals are taken
# over the selected rows only, and rows outside the subset are not assigned.
d[id < 9, rest_of_grp_wtd_avg := {
  contrib <- wt * x
  (sum(contrib) - contrib) / (sum(wt) - wt)
}, by = grp]


> d
    id grp wt   x rest_of_grp_wtd_avg
 1:  1   A 10  89            71.00000
 2:  6   A  9  71            89.00000
 3:  3   B  7  65            55.00000
 4:  7   B  3  55            65.00000
 5:  9   B  4  29                  NA
 6: 10   B 10  15                  NA
 7:  2   C  5  70            88.00000
 8:  4   C  6 100            67.75000
 9:  5   C  2  66            84.16667
10:  8   C  1  60            83.23077

答案 1 :(得分:1)

不需要自连接(self join)。使用 dplyr 的窗口函数(window functions),您可以非常轻松地计算每组的度量:

# Window-function approach: inside a grouped mutate(), sum() is evaluated per
# group, so each row's own contribution can be subtracted from its group
# totals without any self-join.
ex(10) %>%
  group_by(grp) %>%
  mutate(rest_of_grp_wtd_avg = (sum(wt * x) - wt * x) / (sum(wt) - wt)) %>%
  ungroup() # drop the grouping so later verbs are not applied per grp