在R中分组后选择聚合列的第n个值

时间:2016-10-18 15:22:43

标签: r dataframe group-by aggregate

给出df如下:

  # group value
# 1     A     8
# 2     A     1
# 3     A     7
# 4     B     3
# 5     B     2
# 6     B     6
# 7     C     4
# 8     C     5

# Example data: factor column `group` (levels A, B, C) and integer column
# `value`, 8 rows in total.
df <- data.frame(
  group = factor(rep(c("A", "B", "C"), times = c(3L, 3L, 2L))),
  value = c(8L, 1L, 7L, 3L, 2L, 6L, 4L, 5L)
)

索引的向量(可能带有NA):

# One position per group, in factor-level order; NA means "no element".
inds <- c(2, 1, NA_real_)

我们如何才能获得每个组value列的第n个元素,最好使用基础R(base R)?

例如,根据inds,我们希望得到组A的value中的第二个元素、组B的第一个元素,而组C对应NA。结果将是:

#[1] 1 3 NA

5 个答案:

答案 0 :(得分:5)

以下是使用 mapply 和 split 的解决方案:

# Split `value` by `group`, then take the inds[k]-th element of the k-th piece.
mapply(function(vals, k) vals[k], split(df$value, df$group), inds)

返回一个命名向量

 A  B  C 
 1  3 NA

with(df, split(value, group)) 按组拆分 value 列,并返回一个由向量组成的列表。mapply 接收该列表和 "inds",并对每一对参数应用取子集函数 "["。

答案 1 :(得分:2)

使用 levels 和 sapply 即可:

# Example data: factor column `group` (levels A, B, C) and integer `value`.
DF <- structure(list(group = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 
3L), .Label = c("A", "B", "C"), class = "factor"), value = c(8L, 
1L, 7L, 3L, 2L, 6L, 4L, 5L)), .Names = c("group", "value"), class = "data.frame", row.names = c(NA, 
-8L))


# One position per factor level, in level order; NA means "no element".
inds <- c(2, 1, NA)

lvls <- levels(DF$group)

# For the k-th level, take the values of that group and pick element inds[k].
# seq_along() stays empty when there are no levels (1:length(lvls) would
# yield c(1, 0)), and vapply() guarantees an integer vector result.
groupInds <- vapply(
  seq_along(lvls),
  function(x) DF$value[DF$group == lvls[x]][inds[x]],
  integer(1)
)

groupInds
#[1]  1  3 NA

答案 2 :(得分:1)

再次使用mapply(但不如Imo的回答那么优雅):

 mapply(
   # Keep only the `value` column of one group, then take its k-th row.
   function(lvl, k) {
     rows <- subset(df, group == lvl, select = value)
     rows[k, ]
   },
   levels(df$group),
   inds
 )

答案 3 :(得分:1)

我知道你说过最好使用基础R,但仅作记录,这里给出一种data.table的做法:

# Per group, take the element of `value` at position inds[.GRP] (.GRP is the
# running group counter); the unnamed result column is called V1.
setDT(df)[, value[inds[.GRP]], by = group][["V1"]]
#[1]  1  3 NA

答案 4 :(得分:0)

我刚刚提出了另一个解决方案:

# aggregate() applies every index to every group, giving a groups x indices
# matrix in column `value`; its diagonal pairs group k with inds[k].
with(
  aggregate(value ~ group, data = df, FUN = function(v) v[inds]),
  diag(value)
)
#[1]  1  3 NA

**基准测试**

library(microbenchmark)
library(data.table)
# Benchmark input: the example data frame and one index per group.
df <- data.frame(
  group = factor(rep(c("A", "B", "C"), times = c(3L, 3L, 2L))),
  value = c(8L, 1L, 7L, 3L, 2L, 6L, 4L, 5L)
)
inds <- c(2, 1, NA_real_)

# split() + mapply(): pick element inds[k] from the k-th group's values.
# Reads the global `inds`; returns an unnamed vector.
f_Imo <- function(df) {
  per_group <- split(df[["value"]], df[["group"]])
  picked <- mapply(function(vals, k) vals[k], per_group, inds)
  as.vector(picked)
}
# levels() + vapply(): for the k-th factor level, filter `value` to that group
# and pick element inds[k]. Reads the global `inds`; returns an unnamed vector.
f_Osssan <- function(df) {
  lvls <- levels(df$group)
  vapply(
    # seq_along() stays empty when there are no levels, whereas
    # 1:length(lvls) would iterate over c(1, 0).
    seq_along(lvls),
    function(k) df$value[df$group == lvls[k]][inds[k]],
    # Length-one template with the same type as the value column, so the
    # result type does not depend on the number of groups.
    FUN.VALUE = df$value[NA_integer_]
  )
}
# mapply() + subset(): for each level keep the `value` column of that group
# and take row inds[k]. Reads the global `inds`; the result is named by level.
f_User2321 <- function(df) {
  pick_one <- function(lvl, k) {
    rows <- subset(df, group == lvl, select = value)
    rows[k, ]
  }
  unlist(mapply(pick_one, levels(df$group), inds))
}
# data.table: per group, take row inds[.GRP] of `value` (.GRP is the running
# group counter). Reads the global `inds`; returns an unnamed vector.
f_dww <- function(df) {
  # as.data.table() works on a copy. setDT(df) would convert the caller's
  # data.frame by reference, so every function benchmarked afterwards would
  # silently receive a data.table instead of a data.frame.
  dt <- as.data.table(df)
  dt[, .SD[inds[.GRP], value], by = group][, V1]
}
# aggregate() + diag(): apply all of `inds` to every group, which gives a
# groups x indices matrix, then keep the diagonal (group k with inds[k]).
# Reads the global `inds`.
f_m0h3n <- function(df) {
  agg <- aggregate(value ~ group, data = df, FUN = function(v) v[inds])
  diag(agg[, -1])
}

# all.equal() compares only its first two arguments (target and current);
# further positional arguments are taken as options such as the tolerance.
# Compare every result with the first one explicitly instead. Attributes are
# ignored because some of the functions return a vector named by group.
results <- list(f_Imo(df), f_Osssan(df), f_User2321(df), f_dww(df), f_m0h3n(df))
all(vapply(
  results[-1],
  function(r) isTRUE(all.equal(results[[1]], r, check.attributes = FALSE)),
  logical(1)
))
# [1] TRUE

# Time the five approaches on the same input.
microbenchmark(
  f_Imo(df),
  f_Osssan(df),
  f_m0h3n(df),
  f_User2321(df),
  f_dww(df)
)

# Unit: microseconds
           # expr      min        lq       mean   median        uq      max neval
      # f_Imo(df)   71.004   85.1180   91.52996   91.748   96.8810  121.048   100
   # f_Osssan(df)  252.788  276.5265  318.70529  287.648  301.5495 2651.492   100
    # f_m0h3n(df) 1422.627 1555.4365 1643.47184 1618.740 1670.7095 4729.827   100
 # f_User2321(df) 2889.738 3000.3055 3148.44916 3037.945 3118.7860 6013.442   100
      # f_dww(df) 2960.740 3086.2790 3206.02147 3143.381 3250.9545 5976.229   100