将top_n函数映射到分组数据

时间:2019-06-17 11:24:45

标签: r dplyr tidyr

假设我有一个数据框:

# Example data: three groups (A, B, C) with five integer values each.
value <- c(1:5, 16:20, 26:30)
group <- rep(c("A", "B", "C"), each = 5)
df <- data.frame(value = value, group = group)

我想创建一个新的数据框,其中包含每个组的top_n值,这样A组的n = 3,B组的n = 2,C组的n = 1。

# new dataframe should look like this:

  value group
1     5     A
2     4     A
3     3     A
4    20     B
5    19     B
6    30     C

我认为我应该将top_n函数映射到我的数据,但是我正在努力寻找正确的实现。

5 个答案:

答案 0 :(得分:6)

您可以在 Map 调用中使用 tail:

# Split by group, keep the last 3/2/1 rows of the 1st/2nd/3rd group, and
# stack the pieces back into one data frame.
do.call(
  rbind,
  mapply(tail, split(df, df$group), 3:1, SIMPLIFY = FALSE)
)
#      value group
# A.3      3     A
# A.4      4     A
# A.5      5     A
# B.9     19     B
# B.10    20     B
# C       30     C

注意:如果数据不像给定示例中那样已经排好序,请先进行排序: df <- with(df, df[order(group, value), ])

数据

# Example data (same content as the dput() form): 15 integer values and a
# three-level factor `group` (A, B, C; five rows each).
df <- data.frame(
  value = c(1:5, 16:20, 26:30),
  group = factor(rep(c("A", "B", "C"), each = 5))
)

答案 1 :(得分:3)

我更倾向于先在数据框中添加一列 n,然后再使用 arrange 和 slice:

library(dplyr)

# Attach the per-group row count `n` (A = 3, B = 2, everything else = 1),
# sort each group by descending value, and keep the first `n` rows per group.
df %>%
  mutate(
    n = case_when(
      group == "A" ~ 3L,
      group == "B" ~ 2L,
      TRUE ~ 1L
    )
  ) %>%
  arrange(group, desc(value)) %>%
  group_by(group) %>%
  # `n` is constant within a group, so its first element is the group's quota.
  slice(seq_len(n[1L])) %>%
  # Drop the grouping so it is not silently carried into later operations.
  ungroup() %>%
  select(-n)


#  value group
#  <int> <fct>
#1     5 A    
#2     4 A    
#3     3 A    
#4    20 B    
#5    19 B    
#6    30 C    

答案 2 :(得分:3)

使用 map 和 top_n:

library(tidyverse)
# Approach 1: nest the rows of each group, keep the top rows of every nested
# table (the count is read from that group's own `n` column), then unnest.
# The selection is named (`data = -group`) because unnamed selections in
# nest() are deprecated since tidyr 1.0.0. The list-column is overwritten in
# place so no leftover list-column survives unnest().
df %>%
  nest(data = -group) %>%
  mutate(data = map(data, ~ top_n(.x, n = .x$n[1], wt = value))) %>%
  unnest(data)

# Approach 2: split into one tibble per group and row-bind the top rows.
df %>%
  group_split(group) %>%
  map_df(~ top_n(.x, n = .x$n[1], wt = value))

# A tibble: 6 x 3
  value group     n
  <int> <chr> <dbl>
  1     3 A         3
  2     4 A         3
  3     5 A         3
  4    19 B         2
  5    20 B         2
  6    30 C         1

数据

# Example data with an extra column `n`: how many top rows to keep per group
# (3 for A, 2 for B, 1 for C).
value <- c(1:5, 16:20, 26:30)
group <- rep(c("A", "B", "C"), each = 5)
n <- rep(c(3, 2, 1), each = 5)
df <- data.frame(
  value = value,
  group = group,
  n = n,
  stringsAsFactors = FALSE
)

答案 3 :(得分:2)

下面是使用 {dplyr} >= 0.8 和 {purrr} 的实现:

# Example data: groups A, B and C, five integer values in each.
group <- rep(LETTERS[1:3], each = 5)
value <- c(1:5, 16:20, 26:30)
df <- data.frame(value = value, group = group)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(purrr)
# Split into one tibble per group, then keep the top k rows of the i-th group
# (k counts down from the number of groups to 1), sorted by descending value.
df %>%
  group_by(group) %>%
  group_split() %>%
  # rev(seq_along(.)) equals length(.):1 for non-empty input, but is empty
  # (instead of c(0, 1)) when there are no groups at all.
  map2_df(
    .,
    rev(seq_along(.)),
    ~ top_n(.x, .y, value) %>% arrange(desc(value))
  )
#> # A tibble: 6 x 2
#>   value group
#>   <int> <fct>
#> 1     5 A    
#> 2     4 A    
#> 3     3 A    
#> 4    20 B    
#> 5    19 B    
#> 6    30 C

请注意,top_n() 不会对数据进行排序,因此您必须将 top_n() 与 arrange() 组合使用。

另一个主要基于 base R 的建议(仍使用 top_n):

# Split `df` into one data frame per group (stored in `x`).
x <- split(df, df$group)
# Keep the top k rows (by `value`) of the i-th group, with k counting down
# from the number of groups to 1, then stack the pieces with rbind().
# rev(seq_along(x)) is used instead of length(x):1 so that an empty split
# produces no iterations rather than the sequence c(0, 1).
do.call(
  rbind,
  Map(function(x, y) top_n(x, y, value), x, rev(seq_along(x)))
)
    value group
A.1     3     A
A.2     4     A
A.3     5     A
B.1    19     B
B.2    20     B
C      30     C

答案 4 :(得分:2)

您可以用 base R 写成一行代码(one-liner)来实现。我认为在这里使用 dplyr 反而可能更复杂。

# Split `df` by its own group column and keep the y largest values of each
# group (3, 2 and 1 for the 1st, 2nd and 3rd group), largest first.
# - split() uses df$group rather than a free-standing `group` vector, so the
#   code still works when only `df` exists or `df` has been reordered.
# - head() replaces [1:y] so a group with fewer than y rows does not yield
#   rows filled with NA.
mylist <- Map(
  function(x, y) x[head(order(x$value, decreasing = TRUE), y), ],
  split(df, df$group),
  3:1
)
do.call(rbind, mylist)

#  value group
#1     5     A
#2     4     A
#3     3     A
#4    20     B
#5    19     B
#6    30     C

由于您已经在使用dplyr,因此也可以使用bind_rows

# Same per-group selection, stacked with dplyr::bind_rows().
# split() uses df$group (not a free-standing `group` vector), and head()
# replaces [1:y] so short groups do not produce NA rows.
bind_rows(
  Map(
    function(x, y) x[head(order(x$value, decreasing = TRUE), y), ],
    split(df, df$group),
    3:1
  )
)