按组嵌套数据框,但在每个组中包括额外的行

时间:2018-08-08 11:53:07

标签: r dplyr tidyverse

使用菱形数据集,我试图运行回归模型,以比较颜色“ D”与每个切割级别内的其他每种颜色(在回归模型中未指定交互作用)。

为此,我尝试创建按cut和color分组的嵌套数据框,但是在每个组中,我都希望使用适当cut的参考颜色“ D”。

以下代码无法实现我想要的功能,因为每组颜色都不包含颜色“ D”:

    library(tidyverse)

> diamonds %>% 
+     group_by(cut, color) %>% 
+     nest() %>% arrange(cut, color)
# A tibble: 35 x 3
   cut   color data              
   <ord> <ord> <list>            
 1 Fair  D     <tibble [163 x 8]>
 2 Fair  E     <tibble [224 x 8]>
 3 Fair  F     <tibble [312 x 8]>
 4 Fair  G     <tibble [314 x 8]>
 5 Fair  H     <tibble [303 x 8]>
 6 Fair  I     <tibble [175 x 8]>
 7 Fair  J     <tibble [119 x 8]>
 8 Good  D     <tibble [662 x 8]>
 9 Good  E     <tibble [933 x 8]>
10 Good  F     <tibble [909 x 8]>
# ... with 25 more rows

下面的代码可以完成工作,但是我正在寻找一个tidyverse版本:

data_fair_de          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "E"))           %>%    mutate( grouping_var  =  "data_fair_de" )    
data_fair_df          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "F"))           %>%    mutate( grouping_var  =  "data_fair_df" )    
data_fair_dg          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "G"))           %>%    mutate( grouping_var  =  "data_fair_dg" )    
data_fair_dh          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "H"))           %>%    mutate( grouping_var  =  "data_fair_dh" )    
data_fair_di          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "I"))           %>%    mutate( grouping_var  =  "data_fair_di" )    
data_fair_dj          =   diamonds %>% filter(cut=="Fair" & color %in% c("D", "J"))           %>%    mutate( grouping_var  =  "data_fair_dj" )    
data_good_de          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "E"))           %>%    mutate( grouping_var  =  "data_good_de "    )    
data_good_df          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "F"))           %>%    mutate( grouping_var  =  "data_good_df "    )    
data_good_dg          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "G"))           %>%    mutate( grouping_var  =  "data_good_dg "    )    
data_good_dh          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "H"))           %>%    mutate( grouping_var  =  "data_good_dh "    )    
data_good_di          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "I"))           %>%    mutate( grouping_var  =  "data_good_di "    )    
data_good_dj          =   diamonds %>% filter(cut=="Good" & color %in% c("D", "J"))           %>%    mutate( grouping_var  =  "data_good_dj "    )    
data_very_de          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "E"))      %>%    mutate( grouping_var  =  "data_very_de "    )    
data_very_df          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "F"))      %>%    mutate( grouping_var  =  "data_very_df "    )    
data_very_dg          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "G"))      %>%    mutate( grouping_var  =  "data_very_dg "    )    
data_very_dh          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "H"))      %>%    mutate( grouping_var  =  "data_very_dh "    )    
data_very_di          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "I"))      %>%    mutate( grouping_var  =  "data_very_di "    )    
data_very_dj          =   diamonds %>% filter(cut=="Very Good" & color %in% c("D", "J"))      %>%    mutate( grouping_var  =  "data_very_dj "    )    
data_premium_de       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "E"))        %>%    mutate( grouping_var  =  "data_premium_de " )    
data_premium_df       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "F"))        %>%    mutate( grouping_var  =  "data_premium_df " )    
data_premium_dg       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "G"))        %>%    mutate( grouping_var  =  "data_premium_dg " )    
data_premium_dh       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "H"))        %>%    mutate( grouping_var  =  "data_premium_dh " )    
data_premium_di       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "I"))        %>%    mutate( grouping_var  =  "data_premium_di " )    
data_premium_dj       =   diamonds %>% filter(cut=="Premium" & color %in% c("D", "J"))        %>%    mutate( grouping_var  =  "data_premium_dj " )    
data_ideal_de         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "E"))          %>%    mutate( grouping_var  =  "data_ideal_de "   )    
data_ideal_df         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "F"))          %>%    mutate( grouping_var  =  "data_ideal_df "   )    
data_ideal_dg         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "G"))          %>%    mutate( grouping_var  =  "data_ideal_dg "   )    
data_ideal_dh         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "H"))          %>%    mutate( grouping_var  =  "data_ideal_dh "   )    
data_ideal_di         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "I"))          %>%    mutate( grouping_var  =  "data_ideal_di "   )    
data_ideal_dj         =   diamonds %>% filter(cut=="Ideal" & color %in% c("D", "J"))          %>%    mutate( grouping_var  =  "data_ideal_dj "   )    

bind_rows( 
 data_fair_de   ,  data_fair_df   ,  data_fair_dg   ,  data_fair_dh   ,  data_fair_di   ,  data_fair_dj   , 
 data_good_de   ,  data_good_df   ,  data_good_dg   ,  data_good_dh   ,  data_good_di   ,  data_good_dj   , 
 data_very_de   ,  data_very_df   ,  data_very_dg   ,  data_very_dh   ,  data_very_di   ,  data_very_dj   , 
 data_premium_de,  data_premium_df,  data_premium_dg,  data_premium_dh,  data_premium_di,  data_premium_dj, 
 data_ideal_de  ,  data_ideal_df  ,  data_ideal_dg  ,  data_ideal_dh  ,  data_ideal_di  ,  data_ideal_dj  ) %>% 
    group_by(grouping_var) %>% 
    nest()

3 个答案:

答案 0 :(得分:1)

library(tidyverse)

# function to get data based on your filter
f = function(xx,yy,zz) {diamonds %>% filter(cut==xx & color %in% c(yy,zz))}

expand(diamonds, cut, color, color_D="D") %>%              # create all combinations of interest
  mutate_all(as.character) %>%                             # update to character variables
  rowwise() %>%                                            # for each row
  mutate(data = list(f(cut, color, color_D))) %>%          # apply your function
  ungroup() %>%                                            # forget the grouping
  filter(color != color_D)                                 # exclude cases where pair of colours is {D,D}

# # A tibble: 30 x 4
#     cut   color color_D data                 
#     <chr> <chr> <chr>   <list>               
#   1 Fair  E     D       <tibble [387 x 10]>  
#   2 Fair  F     D       <tibble [475 x 10]>  
#   3 Fair  G     D       <tibble [477 x 10]>  
#   4 Fair  H     D       <tibble [466 x 10]>  
#   5 Fair  I     D       <tibble [338 x 10]>  
#   6 Fair  J     D       <tibble [282 x 10]>  
#   7 Good  E     D       <tibble [1,595 x 10]>
#   8 Good  F     D       <tibble [1,571 x 10]>
#   9 Good  G     D       <tibble [1,533 x 10]>
#  10 Good  H     D       <tibble [1,364 x 10]>
#   # ... with 20 more rows

答案 1 :(得分:1)

如果要将D添加到每个组,则只需在每个组后面附加适当的“ D”

df <- diamonds %>% 
    group_by(cut, color) %>% 
    nest() %>% 
    arrange(cut, color) %>%
    ungroup()%>%
    group_by(cut) %>%
    mutate(append_data = map(data, ~rbind(.x, data[[which(row_number() == 1)]])))

df
#    cut   color data               append_data         
#  1 Fair  D     <tibble [163 × 8]> <tibble [326 × 8]>  
#  2 Fair  E     <tibble [224 × 8]> <tibble [387 × 8]>  
#  3 Fair  F     <tibble [312 × 8]> <tibble [475 × 8]>  
#  4 Fair  G     <tibble [314 × 8]> <tibble [477 × 8]>  
#  5 Fair  H     <tibble [303 × 8]> <tibble [466 × 8]> 
#  # ... with 30 more rows

这应该为您提供一个新的嵌套列,每个组也具有D。

答案 2 :(得分:1)

我们可以按照您的想法在嵌套的data.frames上循环,然后按组添加颜色D的行,然后删除D行:

diamonds %>%
  nest(-cut,-color) %>%
  group_by(cut) %>%
  mutate(data = map(data, ~ bind_rows(data[[which(color=="D")]], .x))) %>%
  ungroup %>%
  filter(color != "D") %>%
  arrange(cut, color)
# # A tibble: 30 x 3
#       cut color                 data
#     <ord> <ord>               <list>
#   1  Fair     E   <tibble [387 x 8]>
#   2  Fair     F   <tibble [475 x 8]>
#   3  Fair     G   <tibble [477 x 8]>
#   4  Fair     H   <tibble [466 x 8]>
#   5  Fair     I   <tibble [338 x 8]>
#   6  Fair     J   <tibble [282 x 8]>
#   7  Good     E <tibble [1,595 x 8]>
#   8  Good     F <tibble [1,571 x 8]>
#   9  Good     G <tibble [1,533 x 8]>
#  10  Good     H <tibble [1,364 x 8]>
#   # ... with 20 more rows