如何将整洁的分层数据帧转换为R中的分层列表网格?

时间:2018-03-18 17:02:25

标签: r hierarchical-data

这是前一个问题的一个更复杂的版本,我已经过多地抽象实际问题来应用答案。 R convert tidy hierarchical data frame to hierarchical list

我已经使用for循环将具有两个分组级别的分层数据帧转换为分层列表网格。

是否有更有效的基础R,tidyverse或其他方法来实现这一目标?

在真实数据集中:

  • 分组变量和描述是多字符串。
  • 描述前言 - d# - 位于MWE中以便于检查。
  • 有14种不同类型的关联变量:character,integer和double

规则

第1组和第2组标题位于说明栏中 第1组标题仅出现一次 第2组标题是第1组标题的子项,只有在有新的第2组标题时才会更改 描述是第2组标题的孩子

从此

g1    g2    desc    var1       var2   var3 
A     a     d1 KS3  0.0500     2      PLs  
A     a     d2 CTI  0.0500     9      7O0  
A     b     d3 b8x  0.580      5      he2  
A     b     d4 XOf  0.180     12      XJE  
A     b     d5 ygn  0.900     11      v48  
A     c     d6 dGY  0.770      6      UcH  
A     d     d7 jpG  0.600      4      P5M  
B     d     d8 Z95  0.600     10      j6O  

到此

 desc      var1      var2  var3 
 A         
 a       
 d1 KS3   0.0500     2     PLs  
 d2 CTI   0.0500     9     7O0  
 b       
 d3 b8x   0.580      5     he2  
 d4 XOf   0.180     12     XJE  
 d5 ygn   0.900     11     v48  
 c        
d6 dGY   0.770      6     UcH  
 d          
d7 jpG   0.600      4     P5M  
 B       
 d       

代码

library(tidyverse)
library(stringi)

set.seed(2018) 

tib <-  tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
               g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
               desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
               var1 = round(runif(12), 2),
               var2 = sample.int(12),
               var3 = stri_rand_strings(12, 3))


tib

# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)

# create empty output tibble
output <- 
  as_tibble(matrix(nrow = n_rows, ncol = ncol(tib)-1)) %>% 
  rename(id = V1, desc = V2, var1 = V3, var2 = V4, var3 = V5) %>% 
  mutate(id = NA_character_,
         desc = NA_character_,
         var1 = NA_real_,
         var2 = NA_integer_,
         var3 = NA_character_)

# Loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1

for(i in seq_len(nrow(tib))){

  # level 1 headings
  if(tib$g1[[i]] != level_1) {
    output$id[[output_row]] <- "g1"
    output$desc[[output_row]] <- tib$g1[[i]]
    output_row <- output_row + 1
  }

  # level 2 headings
  if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
    output$id[[output_row]] <- "g2"
    output$desc[[output_row]] <- tib$g2[[i]]
    output_row <- output_row + 1
  }

  level_1 <- tib$g1[[i]]  
  level_2 <- tib$g2[[i]] 

  # Description and data grid
  output$desc[[output_row]] <- tib$desc[[i]]
  output$var1[[output_row]] <- tib$var1[[i]]
  output$var2[[output_row]] <- tib$var2[[i]]
  output$var3[[output_row]] <- tib$var3[[i]]
  output_row <- output_row + 1

}

output

1 个答案:

答案 0 :(得分:0)

调整来自tyluRp R convert tidy hierarchical data frame to hierarchical list的答案我找到了解决方案。

library(tidyverse)
library(stringi)

set.seed(2018) 

tib <-  tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),

               g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
               desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
               var1 = round(runif(12), 2),
               var2 = sample.int(12),
               var3 = stri_rand_strings(12, 3))

# add unique identifier for description and variable rows
tib <- 
  tib %>%
  rowid_to_column() %>% 
  mutate(rowid = paste0("z_", rowid))

# separate tibble for variables associated with descriptions
tib_var <- 
  tib %>% 
  select(rowid, var1, var2, var3)

# code adapted from tyluRp to reorder the data and add description variables 
tib <- 
  tib %>%
  select(g1, g2, desc, rowid) %>% 
  mutate(g2 = paste(g1, g2, sep = "_")) %>% 
  transpose() %>% 
  unlist() %>% 
  stack() %>% 
  distinct(values, ind) %>% 
  mutate(detect_var = str_detect(values, "^z_"),
         ind = lead(case_when(detect_var == TRUE ~ values)),
         values = case_when(detect_var == TRUE ~ NA_character_,
                            TRUE ~ values))%>% 
  drop_na(values) %>% 
  select(values, ind) %>% 
  mutate(values = str_remove(values, "\\D_")) %>% 
  left_join(tib_var, by = c("ind" = "rowid")) %>% 
  select(-ind) %>% 
  replace_na(list(var1 = "", var2 = "", var3 = ""))