为每个类别中多个变量创建比例变量

时间:2020-05-13 11:14:58

标签: r dplyr

我要创建一个数据框,其中包含各类别中观察值所占比例的列,就像这样:

library(tidyverse)
mtcars %>%
  group_by(am) %>%
  summarise(gear3 = sum(gear == 3)/n(), 
            gear4 = sum(gear == 4)/n(), 
            gear5 = sum(gear == 5)/n(), 
            cyl4 = sum(cyl == 4)/n(),
            cyl6 = sum(cyl == 6)/n(),
            cyl8 = sum(cyl == 8)/n())

# # A tibble: 2 x 7
#      am gear3 gear4 gear5  cyl4  cyl6  cyl8
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1     0 0.789 0.211 0     0.158 0.211 0.632
# 2     1 0     0.615 0.385 0.615 0.231 0.154

我正在寻找无需手动命名新的摘要变量的方法吗?

似乎有一些问题,例如here,与为单个变量创建比例有关,我可以为每个变量复制比例,先进行枢轴旋转然后合并,但是在我的应用程序中将变得乏味-我正在尝试为许多变量构建数据框

mtcars %>%
  group_by(am, gear) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
# # A tibble: 4 x 4
# # Groups:   am [2]
#      am  gear     n  freq
#   <dbl> <dbl> <int> <dbl>
# 1     0     3    15 0.789
# 2     0     4     4 0.211
# 3     1     4     8 0.615
# 4     1     5     5 0.385

mtcars %>%
  group_by(am, cyl) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))
# # A tibble: 6 x 4
# # Groups:   am [2]
#      am   cyl     n  freq
#   <dbl> <dbl> <int> <dbl>
# 1     0     4     3 0.158
# 2     0     6     4 0.211
# 3     0     8    12 0.632
# 4     1     4     8 0.615
# 5     1     6     3 0.231
# 6     1     8     2 0.154

2 个答案:

答案 0 :(得分:1)

这是一种解决方案:

library(dplyr)

freqPairs <- function(df, first, second){
  pairs <- as.list(data.frame(t(expand.grid(first, second))))
  res <- lapply(pairs, function(z) df %>%
                  group_by(!!sym(z[1]), !!sym(z[2])) %>%
                  summarise(n = n()) %>%
                  mutate(freq = n / sum(n)) %>% 
                  {colnames(.)[1:2] = c("Var1", "Var2"); .} %>% 
                  ungroup())
  setNames(res, unlist(lapply(pairs, paste, collapse="_vs_")))
}

bind_rows(freqPairs(mtcars, first=c("am"), second=c("cyl", "gear")), .id = "comparison")
#> # A tibble: 10 x 5
#>    comparison  Var1  Var2     n  freq
#>    <chr>      <dbl> <dbl> <int> <dbl>
#>  1 am_vs_cyl      0     4     3 0.158
#>  2 am_vs_cyl      0     6     4 0.211
#>  3 am_vs_cyl      0     8    12 0.632
#>  4 am_vs_cyl      1     4     8 0.615
#>  5 am_vs_cyl      1     6     3 0.231
#>  6 am_vs_cyl      1     8     2 0.154
#>  7 am_vs_gear     0     3    15 0.789
#>  8 am_vs_gear     0     4     4 0.211
#>  9 am_vs_gear     1     4     8 0.615
#> 10 am_vs_gear     1     5     5 0.385

reprex package(v0.3.0)于2020-05-13创建

您始终可以从Var1列中恢复Var2comparison的名称,例如通过拆分该字符串。示例:

library(data.table)
res <- bind_rows(freqPairs(mtcars, first=c("am"), second=c("cyl", "gear")), .id = "comparison")
data.table(res)[, c("Variable1", "Variable2") := tstrsplit(comparison, "_vs_")][]
#>     comparison Var1 Var2  n      freq Variable1 Variable2
#>  1:  am_vs_cyl    0    4  3 0.1578947        am       cyl
#>  2:  am_vs_cyl    0    6  4 0.2105263        am       cyl
#>  3:  am_vs_cyl    0    8 12 0.6315789        am       cyl
#>  4:  am_vs_cyl    1    4  8 0.6153846        am       cyl
#>  5:  am_vs_cyl    1    6  3 0.2307692        am       cyl
#>  6:  am_vs_cyl    1    8  2 0.1538462        am       cyl
#>  7: am_vs_gear    0    3 15 0.7894737        am      gear
#>  8: am_vs_gear    0    4  4 0.2105263        am      gear
#>  9: am_vs_gear    1    4  8 0.6153846        am      gear
#> 10: am_vs_gear    1    5  5 0.3846154        am      gear

注意: 如果您确实想要两个订单中所有可能的对,则可以使用类似以下内容的

pairs <- c(combn(colnames(mtcars), 2, simplify=FALSE),
lapply(combn(colnames(mtcars), 2, simplify=FALSE), rev))

答案 1 :(得分:1)

map()中的purrr找出一种方法

首先,该函数用于计算比例的命名向量

prop <- function(v){
  n <- match.call() %>%
    as.character() %>%
    .[2] %>%
    str_extract(pattern = "(?<=\\$)(.*)")

  table(v) %>% 
    `/`(sum(.)) %>%
    as.matrix() %>%
    t() %>%
    as_tibble() %>%
    set_names(paste0(n, colnames(.)))
}
prop(v = mtcars$gear) 
# # A tibble: 1 x 3
#   gear3 gear4 gear5
#   <dbl> <dbl> <dbl>
# 1 0.469 0.375 0.156

然后使用map()将函数应用于每个组,一次一个变量

mtcars %>%
  group_nest(am) %>%
  mutate(p_gear = map(.x = data, .f = ~prop(.x$gear)),
         p_cyl = map(.x = data, .f = ~prop(.x$cyl))) %>%
  unnest(c(p_gear, p_cyl)) %>%
  select(-data)
# # A tibble: 2 x 7
#      am  gear3 gear4  gear5  cyl4  cyl6  cyl8
#   <dbl>  <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
# 1     0  0.789 0.211 NA     0.158 0.211 0.632
# 2     1 NA     0.615  0.385 0.615 0.231 0.154

另一个示例,包括将NA替换为零

mtcars %>%
  group_nest(carb) %>%
  mutate(p_gear = map(.x = data, .f = ~prop(.x$gear)),
         p_cyl = map(.x = data, .f = ~prop(.x$cyl)),
         p_vs = map(.x = data, .f = ~prop(.x$vs))) %>%
  unnest(c(p_gear, p_cyl, p_vs)) %>%
  select(-data) %>%
  mutate_all(~ifelse(is.na(.), 0, .))
# # A tibble: 6 x 9
#    carb gear3 gear4 gear5  cyl4  cyl6  cyl8   vs1   vs0
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1     1 0.429 0.571   0   0.714 0.286   0     1     0  
# 2     2 0.4   0.4     0.2 0.6   0       0.4   0.5   0.5
# 3     3 1     0       0   0     0       1     0     1  
# 4     4 0.5   0.4     0.1 0     0.4     0.6   0.2   0.8
# 5     6 0     0       1   0     1       0     0     1  
# 6     8 0     0       1   0     0       1     0     1