有条件地将Reshape2应用于数据框

时间:2019-07-27 14:48:05

标签: r reshape2

我有一个示例数据框,如下所示

    V1        V2
   TEST_1     
   TEST_1a    AA 
   TEST_2     
   TEST_2a    BB 
   TEST_3     
   TEST_4     
   TEST_4a    CC
   TEST_4b    DD

我想根据 V1 列中 `_` 之后的值,将此数据框转换为如下所示的形式

Col1a    Col1b   Col2a    Col2b   Col3a   Col3b  Col4a   Col4b

TEST_1           TEST_2           TEST_3         TEST_4   
TEST_1a   AA     TEST_2a    BB                   TEST_4a  CC
                                                 TEST_4b  DD

以此类推。

我已经研究过reshape2函数,但似乎没有一个适合我的需求。任何帮助都非常感激

1 个答案:

答案 0 :(得分:0)

一种方法是使用 data.table 的 dcast。根据所示的输出,我们可能需要对 V1 和 V2 两列分别进行重塑,然后再把结果合并

library(data.table)
library(gtools)

# Turn df1 into a data.table by reference so `:=` can add columns in place.
setDT(df1)

# Drop the trailing lowercase suffix from V1 and give each run of identical
# bases a running id ("Col1", "Col2", ...).
df1[, grp := paste0("Col", rleid(sub("[a-z]+$", "", V1)))]

# Target column names: "<grp>a" will hold V1 values, "<grp>b" will hold V2.
df1[, c("grp1", "grp2") := .(paste0(grp, "a"), paste0(grp, "b"))]
df1[]

# Reshape V1 and V2 separately; rowid() numbers the rows within each target
# column so that values line up row by row in the wide result.
wide_v1 <- dcast(df1, rowid(grp1) ~ grp1, value.var = "V1", fill = "")
wide_v2 <- dcast(df1, rowid(grp2) ~ grp2, value.var = "V2", fill = "")

# Bind both wide tables and drop the two row-id helper columns.
out <- cbind(wide_v1, wide_v2)
out[, c('grp1', 'grp2') := NULL]

# Interleave the columns as Col1a, Col1b, Col2a, Col2b, ...
setcolorder(out, mixedsort(names(out)))
out
#     Col1a Col1b   Col2a Col2b  Col3a Col3b   Col4a Col4b
#1:  TEST_1        TEST_2       TEST_3        TEST_4      
#2: TEST_1a    AA TEST_2a    BB              TEST_4a    CC
#3:                                          TEST_4b    DD

或使用tidyverse

library(dplyr)
library(tidyr)
library(stringr)  # str_remove(), str_c()
library(purrr)    # map(), reduce()
library(gtools)   # mixedsort()

# Reshape df1 so that each group of V1 values sharing a base name becomes a
# pair of columns: "Col<k>a" holds the V1 values, "Col<k>b" the V2 values.
df1 %>%
    # Strip the trailing lowercase suffix (same pattern as the data.table
    # version) to get the base name, then derive the target column names.
    mutate(grp = str_remove(V1, "[a-z]+$"),
           grp = str_c("Col", group_indices(., grp)),
           grp1 = str_c(grp, "a"), 
           grp2 = str_c(grp, 'b')) %>% 
    # Split into two (value, target-column) tables: one for V1, one for V2.
    {list(select(., V1, grp1), select(., V2, grp2))} %>% 
    # For each table: number the rows within every target column, then spread
    # to wide format using column 2 as key and column 1 as value.
    map(~ .x %>% 
              group_by_at(2) %>%
              mutate(rn = row_number()) %>% 
              ungroup %>%
              spread(!! rlang::sym(names(.)[2]), 
                     !! rlang::sym(names(.)[1]), fill = "") ) %>%
   # Join the two wide tables on the row number and drop it afterwards.
   reduce(inner_join, by = 'rn') %>%
   select(-rn) %>% 
   # Interleave the columns as Col1a, Col1b, Col2a, Col2b, ...
   select(mixedsort(names(.)))
# A tibble: 3 x 8
#  Col1a   Col1b Col2a   Col2b Col3a  Col3b Col4a   Col4b
#  <chr>   <chr> <chr>   <chr> <chr>  <chr> <chr>   <chr>
#1 TEST_1  ""    TEST_2  ""    TEST_3 ""    TEST_4  ""   
#2 TEST_1a AA    TEST_2a BB    ""     ""    TEST_4a CC   
#3 ""      ""    ""      ""    ""     ""    TEST_4b DD   

数据

# Sample input: V1 holds the labels, V2 the associated value ("" when absent).
df1 <- data.frame(
    V1 = c("TEST_1", "TEST_1a", "TEST_2", "TEST_2a",
           "TEST_3", "TEST_4", "TEST_4a", "TEST_4b"),
    V2 = c("", "AA", "", "BB", "", "", "CC", "DD"),
    stringsAsFactors = FALSE
)