
时间:2015-06-11 01:19:39

标签: r dplyr greatest-n-per-group


> df <- data_frame(g = c('A', 'A', 'B', 'B', 'B', 'C'), x = c(7, 3, 5, 9, 2, 4))
> df
Source: local data frame [6 x 2]

  g x
1 A 7
2 A 3
3 B 5
4 B 9
5 B 2
6 C 4


> df %>% group_by(g) %>% mutate(x_max = max(x))
Source: local data frame [6 x 3]
Groups: g

  g x x_max
1 A 7     7
2 A 3     7
3 B 5     9
4 B 9     9
5 B 2     9
6 C 4     4



Source: local data frame [6 x 3]
Groups: g

  g x x_max x_max_exclude
1 A 7     7             3
2 A 3     7             7
3 B 5     9             9
4 B 9     9             5
5 B 2     9             9
6 C 4     4            NA


> df %>% group_by(g) %>% mutate(x_max = max(x), r = row_number(), x_max_exclude = max(x[-r]))
Source: local data frame [6 x 5]
Groups: g

  g x x_max r x_max_exclude
1 A 7     7 1          -Inf
2 A 3     7 2          -Inf
3 B 5     9 1          -Inf
4 B 9     9 2          -Inf
5 B 2     9 3          -Inf
6 C 4     4 1          -Inf
Warning messages:
1: In max(c(4, 9, 2)[-1:3]) :
  no non-missing arguments to max; returning -Inf
2: In max(c(4, 9, 2)[-1:3]) :
  no non-missing arguments to max; returning -Inf
3: In max(c(4, 9, 2)[-1:3]) :
  no non-missing arguments to max; returning -Inf


4 个答案:

答案 0 :(得分:5)


# Per group, compute the maximum of x excluding the current row.
# After sorting by x in descending order, x[2] is the second-largest value of
# the group. Rows equal to the group maximum receive x[2]; every other row
# receives the group maximum. A single-row group has no x[2], so it gets NA.
df %>% 
  group_by(g) %>% 
  arrange(desc(x)) %>% 
  mutate(max = ifelse(x == max(x), x[2], max(x)))


#Source: local data frame [6 x 3]
#Groups: g
#  g x max
#1 A 7   3
#2 A 3   7
#3 B 9   5
#4 B 5   9
#5 B 2   9
#6 C 4  NA



df <- data.frame(g = sample(LETTERS, 10e5, replace = TRUE),
                 x = sample(1:10, 10e5, replace = TRUE))


# Time the three approaches on the same data, 50 evaluations each.
# NOTE(review): setDT() converts `df` to a data.table by reference and `:=`
# adds a column in place, so once `arun` has run, the other expressions
# operate on the modified object -- confirm this is acceptable for the
# comparison.
mbm <- microbenchmark(
  steven = df %>%
    group_by(g) %>%
    arrange(desc(x)) %>%
    mutate(max = ifelse(x == max(x), x[2], max(x))),
  eric = df %>%
    group_by(g) %>%
    mutate(x_max = max(x),
           x_max2 = sort(x, decreasing = TRUE)[2],
           x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
    # Drop the helper column so only the requested columns remain.
    select(-x_max2),
  arun = setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g],
  times = 50
)

@Arun 的 data.table 解决方案是最快的:

# Unit: milliseconds
#    expr       min        lq      mean    median       uq      max neval cld
#  steven 158.58083 163.82669 197.28946 210.54179 212.1517 260.1448    50  b 
#    eric 223.37877 228.98313 262.01623 274.74702 277.1431 284.5170    50   c
#    arun  44.48639  46.17961  54.65824  47.74142  48.9884 102.3830    50 a   

enter image description here

答案 1 :(得分:4)


# Visit the rows in increasing order of x and process them group by group (g).
# Within a group, the first .N-1 rows get the value at position .N (the group
# maximum); the last row, which holds the maximum itself, gets the value at
# position .N-1 (the runner-up).
setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g]

思路是先按列 x 排序,然后在这些已排序的索引上按 g 分组。由于索引是有序的,对于每组的前 .N-1 行,最大值就是第 .N 行的值;对于第 .N 行,则是第 .N-1 行的值。



答案 2 :(得分:2)


# Per group: x_max is the largest x and x_max2 the second largest (NA when the
# group has a single row). Rows holding the maximum receive x_max2; all other
# rows receive x_max.
df %>%
  group_by(g) %>%
  mutate(x_max = max(x),
         x_max2 = sort(x, decreasing = TRUE)[2],
         x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
  # Drop the helper column so only g, x, x_max and x_max_exclude remain.
  select(-x_max2)

答案 3 :(得分:1)


df %>% group_by(g) %>% mutate(x_max_exclude = max_exclude(x))
Source: local data frame [6 x 3]
Groups: g

  g x x_max_exclude
1 A 7             3
2 A 3             7
3 B 5             9
4 B 9             5
5 B 2             9
6 C 4            NA


#' Maximum of a vector with each element left out in turn
#'
#' @param v A vector accepted by `max()` (typically numeric).
#' @return A vector of the same length as `v`; element `i` is `max(v[-i])`.
#'   Positions where that maximum is not finite (for example when `v` has a
#'   single element, so nothing is left after the exclusion) are `NA`.
max_exclude <- function(v) {
  # Preallocate the result instead of growing it inside the loop.
  res <- vector(typeof(v), length(v))
  for (i in seq_along(v)) {
    # max() of an empty vector warns and returns -Inf; that case is mapped to
    # NA below, so the warning is suppressed here.
    res[i] <- suppressWarnings(max(v[-i]))
  }
  res[!is.finite(res)] <- NA
  res
}

它同样适用于 base R:

df$x_max_exclude <- with(df, ave(x, g, FUN=max_exclude))
Source: local data frame [6 x 3]

  g x x_max_exclude
1 A 7             3
2 A 3             7
3 B 5             9
4 B 9             5
5 B 2             9
6 C 4            NA



# Benchmark data: four groups of 1000 rows each, x drawn from 1..10.
big.df <- data.frame(g = rep(LETTERS[1:4], each = 1e3),
                     x = sample(10, 4e3, replace = TRUE))

# NOTE(review): Eric and Arun below reference `df`, not `big.df` -- confirm
# that both objects hold the same data before comparing the timings.
microbenchmark(
  plafort_dplyr = big.df %>% group_by(g) %>% mutate(x_max_exclude = max_exclude(x)),
  plafort_ave = big.df$x_max_exclude <- with(big.df, ave(x, g, FUN = max_exclude)),
  StevenB = big.df %>%
    group_by(g) %>%
    mutate(max = ifelse(row_number(desc(x)) == 1, x[row_number(desc(x)) == 2], max(x))),
  Eric = df %>%
    group_by(g) %>%
    mutate(x_max = max(x),
           x_max2 = sort(x, decreasing = TRUE)[2],
           x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
    # Drop the helper column so only the requested columns remain.
    select(-x_max2),
  Arun = setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g]
)

Unit: milliseconds
          expr       min        lq      mean    median        uq        max neval
 plafort_dplyr 75.219042 85.207442 89.247409 88.203225 90.627663 179.553166   100
   plafort_ave 75.907798 84.604180 87.136122 86.961251 89.431884 104.884294   100
       StevenB  4.436973  4.699226  5.207548  4.931484  5.364242  11.893306   100
          Eric  7.233057  8.034092  8.921904  8.414720  9.060488  15.946281   100
          Arun  1.789097  2.037235  2.410915  2.226988  2.423638   9.326272   100