这是前一个问题的一个更复杂的版本,我已经过多地抽象实际问题来应用答案。 R convert tidy hierarchical data frame to hierarchical list
我已经使用for循环将具有两个分组级别的分层数据帧转换为分层列表网格。
是否有更有效的基础R,tidyverse或其他方法来实现这一目标?
在真实数据集中:
第1组和第2组标题位于说明栏中 第1组标题仅出现一次 第2组标题是第1组标题的子项,只有在有新的第2组标题时才会更改 描述是第2组标题的孩子
g1 g2 desc var1 var2 var3
A a d1 KS3 0.0500 2 PLs
A a d2 CTI 0.0500 9 7O0
A b d3 b8x 0.580 5 he2
A b d4 XOf 0.180 12 XJE
A b d5 ygn 0.900 11 v48
A c d6 dGY 0.770 6 UcH
A d d7 jpG 0.600 4 P5M
B d d8 Z95 0.600 10 j6O
desc var1 var2 var3
A
a
d1 KS3 0.0500 2 PLs
d2 CTI 0.0500 9 7O0
b
d3 b8x 0.580 5 he2
d4 XOf 0.180 12 XJE
d5 ygn 0.900 11 v48
c
d6 dGY 0.770 6 UcH
d
d7 jpG 0.600 4 P5M
B
d
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
tib
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = ncol(tib)-1)) %>%
rename(id = V1, desc = V2, var1 = V3, var2 = V4, var3 = V5) %>%
mutate(id = NA_character_,
desc = NA_character_,
var1 = NA_real_,
var2 = NA_integer_,
var3 = NA_character_)
# Loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$id[[output_row]] <- "g1"
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$id[[output_row]] <- "g2"
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data grid
output$desc[[output_row]] <- tib$desc[[i]]
output$var1[[output_row]] <- tib$var1[[i]]
output$var2[[output_row]] <- tib$var2[[i]]
output$var3[[output_row]] <- tib$var3[[i]]
output_row <- output_row + 1
}
output
答案 0 :(得分:0)
调整来自tyluRp R convert tidy hierarchical data frame to hierarchical list的答案我找到了解决方案。
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
# add unique identifier for description and variable rows
tib <-
tib %>%
rowid_to_column() %>%
mutate(rowid = paste0("z_", rowid))
# separate tibble for variables associated with descriptions
tib_var <-
tib %>%
select(rowid, var1, var2, var3)
# code adapted from tyluRp to reorder the data and add description variables
tib <-
tib %>%
select(g1, g2, desc, rowid) %>%
mutate(g2 = paste(g1, g2, sep = "_")) %>%
transpose() %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^z_"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values))%>%
drop_na(values) %>%
select(values, ind) %>%
mutate(values = str_remove(values, "\\D_")) %>%
left_join(tib_var, by = c("ind" = "rowid")) %>%
select(-ind) %>%
replace_na(list(var1 = "", var2 = "", var3 = ""))