如何创建bin频率表,其中bin大小随组而异

时间:2018-08-10 11:36:26

标签: r group-by data.table aggregate frequency

我正在尝试创建一个bin频率表,其中有多个分组列,但是更重要的是,bin的大小随分组列之一而变化。让我举例说明:

# Reproducible example: 42 observations labelled by ID, date and group.
set.seed(42)
ID <- factor(rep(c("A", "B"), times = c(20, 22)))
date <- factor(rep(c("C", "D", "E", "F"), times = c(12, 8, 10, 12)))
group <- factor(
  rep(c("G", "H", "G", "G", "H", "G", "H"), times = c(6, 6, 8, 6, 4, 6, 6))
)
# Normal draws (mean 20, sd 10) rounded to whole numbers.
val <- round(rnorm(n = 42, mean = 20, sd = 10), digits = 0)

df <- data.frame(ID = ID, date = date, group = group, val = val)

借助我从另一篇帖子中改编的一些代码,可以生成按ID、date和group分组的val频率表:

# Bin edges shared by every group: (0,10], (10,30], (30,100].
br <- c(0, 10, 30, 100)

# For each ID/date/group combination, count how many values fall in each bin.
frqtab <- aggregate(
  val ~ ID + date + group,
  data = df,
  FUN = function(v) table(cut(v, breaks = br))
)

但是,我想为group列的每个水平(level)设置不同的分箱断点,例如G组使用brG <- c(0,10,30,100),而H组使用brH <- c(0,10,50,100)。我猜可以写一些ifelse语句来实现,但这样会很混乱,尤其是我的真实数据中有很多组。任何帮助将不胜感激!

2 个答案:

答案 0 :(得分:2)

这是一个可能的解决方案:

# example data: 42 rows, reproducible through the fixed seed
set.seed(42)
ID <- factor(rep(c("A", "B"), times = c(20, 22)))
date <- factor(rep(c("C", "D", "E", "F"), times = c(12, 8, 10, 12)))
group <- factor(
  rep(c("G", "H", "G", "G", "H", "G", "H"), times = c(6, 6, 8, 6, 4, 6, 6))
)
# normal draws (mean 20, sd 10) rounded to whole numbers
val <- round(rnorm(n = 42, mean = 20, sd = 10), digits = 0)

df <- data.frame(ID = ID, date = date, group = group, val = val)

# The function from the question, with the breaks turned into an argument.
#   br: numeric vector of break points passed to cut()
#   df: data frame with columns val, ID, date and group
# Returns the aggregate() result; the per-bin counts form the matrix
# column `val`, whose column names are the bin labels.
f <- function(br, df) {
  aggregate(
    val ~ ID + date + group,
    data = df,
    FUN = function(v) table(cut(v, breaks = br))
  )
}

library(tidyverse)

# create a look up table
# (one row per group; `br` is a list column holding that group's cut() breaks)
# tibble() is used because data_frame() is deprecated in the tibble package.
look_up <- tibble(
  group_id = c("G", "H"),
  br = list(c(0, 10, 30, 100), c(0, 10, 50, 100))
)

# For every group: nest its rows, look up its breaks, then tabulate with f().
df_upd <- df %>%
  group_by(group_id = group) %>%           # copy `group` so it stays in `data`
  nest() %>%                               # one row per group, rows in `data`
  left_join(look_up, by = "group_id") %>%  # attach that group's breaks (`br`)
  mutate(d = map2(br, data, f))            # calls f(br, data) for each row

# see results
df_upd$d

# [[1]]
#   ID date group val.(0,10] val.(10,30] val.(30,100]
# 1  A    C     G          0           5            1
# 2  A    D     G          1           4            1
# 3  B    E     G          1           3            2
# 4  B    F     G          1           5            0
# 
# [[2]]
#   ID date group val.(0,10] val.(10,50] val.(50,100]
# 1  A    C     H          0           6            0
# 2  B    E     H          1           3            0
# 3  B    F     H          0           5            0

我决定沿用您提供的函数,它显然会把分箱区间(breaks)写进列名。因此,当不同的组使用不同的断点时,输出无法放进同一个数据框,因为各组的列名互不一致。

要把所有结果放进同一个数据框,唯一的办法是修改函数,使其产生更“整洁”(tidy)的输出:

library(tidyverse)

# updated function
# Tidy (long-format) bin counts: one row per bin and ID/date/group combination.
#   br: numeric vector of break points passed to cut()
#   df: data frame with columns val, ID, date and group
f <- function(br, df) {
  binned <- mutate(df, g = cut(val, br))
  # na.omit() drops every row containing an NA, which includes values that
  # cut() could not place in any bin
  kept <- na.omit(binned)
  counts <- count(kept, g, ID, date, group)
  # add the bins that are missing for an ID/date/group, with a count of zero
  complete(counts, g, nesting(ID, date, group), fill = list(n = 0))
}

# same lookup table
# (one row per group; `br` is a list column holding that group's cut() breaks)
# tibble() is used because data_frame() is deprecated in the tibble package.
look_up <- tibble(
  group_id = c("G", "H"),
  br = list(c(0, 10, 30, 100), c(0, 10, 50, 100))
)

# apply your function
# Written to give the same 5-column result with old and current tidyr:
# current tidyr keeps the grouping after nest() and keeps the other list
# columns after unnest(), so the grouping is dropped explicitly and only the
# result column `d` is selected before unnesting.
df %>%
  group_by(group_id = as.character(group)) %>%  # character key, like look_up
  nest() %>%
  ungroup() %>%                                 # nest() may keep the grouping
  left_join(look_up, by = "group_id") %>%
  mutate(d = map2(br, data, f)) %>%
  select(d) %>%                                 # drops group_id, data and br
  unnest(d) %>%
  arrange(group, date, ID)   # for visualisation purposes only

# # A tibble: 21 x 5
#   g        ID    date  group     n
#   <chr>    <fct> <fct> <fct> <dbl>
# 1 (0,10]   A     C     G         0
# 2 (10,30]  A     C     G         5
# 3 (30,100] A     C     G         1
# 4 (0,10]   A     D     G         1
# 5 (10,30]  A     D     G         4
# 6 (30,100] A     D     G         1
# 7 (0,10]   B     E     G         1
# 8 (10,30]  B     E     G         3
# 9 (30,100] B     E     G         2
# 10 (0,10]  B     F     G         1
# # ... with 11 more rows

答案 1 :(得分:1)

Antonios K的答案的“整洁”部分的data.table版本:

library(data.table)

# `df` was built with data.frame(); convert it by reference so that the
# data.table `[` syntax (a j expression plus by =) is available.
setDT(df)

# One table() of binned values per ID/date/group. The middle break is 30 for
# group G and 50 for any other group.
df[, data.table(table(bin = cut(val, 
  breaks = c(0, 10, if (group == "G") 30 else 50, 100)
))), by=.(ID, date, group)]

    ID date group      bin N
 1:  A    C     G   (0,10] 0
 2:  A    C     G  (10,30] 5
 3:  A    C     G (30,100] 1
 4:  A    C     H   (0,10] 0
 5:  A    C     H  (10,50] 6
 6:  A    C     H (50,100] 0
 7:  A    D     G   (0,10] 1
 8:  A    D     G  (10,30] 4
 9:  A    D     G (30,100] 1
10:  B    E     G   (0,10] 1
11:  B    E     G  (10,30] 3
12:  B    E     G (30,100] 2
13:  B    E     H   (0,10] 1
14:  B    E     H  (10,50] 3
15:  B    E     H (50,100] 0
16:  B    F     G   (0,10] 1
17:  B    F     G  (10,30] 5
18:  B    F     G (30,100] 0
19:  B    F     H   (0,10] 0
20:  B    F     H  (10,50] 5
21:  B    F     H (50,100] 0
    ID date group      bin N

或编写一个辅助函数和一个辅助表:

library(data.table)
library(magrittr)

# Bin `x` at the break points `br` and return the counts as a data.table
# with the columns `bin` and `N`.
cut_tab <- function(x, br) {
  x %>% cut(br) %>% table(bin = .) %>% data.table
}

# Lookup table keyed by group; `br` is a list column of break vectors.
cutDT <- data.table(
  key = "group",
  group = c("G", "H"),
  br = list(c(0, 10, 30, 100), c(0, 10, 50, 100))
)

# `df` was built with data.frame(); setDT() converts it by reference and is
# harmless when `df` already is a data.table.
setDT(df)

# .BY holds the current ID/date/group values; joining it to cutDT on the key
# column fetches the breaks for the current group.
df[, cut_tab(val, br = cutDT[.BY, on = key(cutDT), unlist(x.br)]),
   by = .(ID, date, group)]