Pivot_longer将单个列分成多个列

时间:2020-07-09 18:31:30

标签: r tidyverse tidyr

我从一个同事那里继承了一个数据集,我想在R中以长格式(当前为宽格式数据)进行分析。这是数据的子集:

    structure(list(Phase = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 
    4L), .Label = c("1_baseline", "2_first_10mins", "3_second_10mins", 
    "4_final_10mins"), class = "factor"), MaleID = c(1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L), Weight_g = c(48.91, 84.81, 48.91, 84.81, 
    48.91, 84.81, 48.91, 84.81), FemaleID = c(1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L), Female_side = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L), .Label = c("Left", "Right"), class = "factor"), Date = structure(c(2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("7/11/2016 10:53", "7/11/2016 9:26"
    ), class = "factor"), Trx = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = "G1", class = "factor"), Center_All_TotalDistMoved = c(2385, 
    3284.86, 1612.03, 2656.36, 1674.68, 1174.75, 1543.54, 629.412
    ), Nose_Left_CumDur = c(112.64, 248.32, 64.24, 91.12, 82.56, 
    22.8, 71.88, 16.48), Nose_Left_FreqZoneEntry = c(260L, 602L, 
    60L, 208L, 77L, 16L, 139L, 10L), Nose_Left_LatencyFirstEntry = c(4.6, 
    0.08, 1571.84, 1560, 2168.16, 2161.96, 2776.28, 2790.72), Nose_Right_CumDur = c(92.4, 
    40.48, 194.88, 426.96, 188.76, 500.44, 93.92, 559.56), Nose_Right_FreqZoneEntry = c(174L, 
    100L, 257L, 229L, 277L, 159L, 182L, 54L), Nose_Right_LatencyFirstEntry = c(0.12, 
    117.44, 1565.2, 1660.72, 2160, 2165.4, 2790.2, 2760.76), Nose_Dead_CumDur = c(327.68, 
    112.88, 176.84, 25.16, 143.44, 45.32, 314.48, 12.16), Nose_Dead_FreqZoneEntry = c(536L, 
    722L, 370L, 142L, 437L, 148L, 455L, 52L), Nose_Dead_LatencyFirstEntry = c(0, 
    0, 1562.92, 1684.04, 2160.6, 2160, 2760, 2760), Center_Left_CumDur = c(73.2, 
    260.96, 37.24, 102.68, 46.16, 20.32, 32.92, 17.52), Center_Left_FreqZoneEntry = c(23L, 
    74L, 9L, 8L, 13L, 4L, 14L, 1L), Center_Left_LatencyFirstEntry = c(108.16, 
    0, 1792.32, 1560, 2170.76, 2162.8, 2804.04, 2792.4), Center_Right_CumDur = c(63.16, 
    25.44, 154, 415.56, 131.16, 475.4, 64.28, 541.68), Center_Right_FreqZoneEntry = c(10L, 
    8L, 26L, 26L, 26L, 22L, 29L, 13L), Center_Right_LatencyFirstEntry = c(121.64, 
    139.04, 1673.12, 1661.84, 2160, 2166.08, 2790.72, 2760), Center_Dead_CumDur = c(441.64, 
    101.76, 243.56, 62.76, 198, 87.88, 430.28, 32.4), Center_Dead_FreqZoneEntry = c(34L, 
    72L, 38L, 23L, 37L, 26L, 46L, 8L), Center_Dead_LatencyFirstEntry = c(0, 
    20.28, 1560, 1706.4, 2160.96, 2160, 2760, 2770.84), Center_All_MeanVelocity = c(3.98271, 
    5.4766, 2.68744, 4.42786, 2.79113, 1.95818, 2.57738, 1.05056)), class = "data.frame", row.names = c(NA, 
    -8L))

在每列(具体而言,第8-27列)中,我要提取3个因素/测量值:

  1. 点(变量=中心;鼻子);
  2. 区域(变量=左;右;死);
  3. 测量(几个变量:TotalDistMoved,CumDur,FreqZoneEntry,LatencyFirstEntry,MeanVelocity)。

基于此处的两篇类似文章(onetwo),我尝试对我的数据子集使用以下代码(首先只是一列,然后进行扩展),但是我我对正则表达式感到非常困惑,我认为这是我的问题的一部分(不确定如何基于“ _”作为分隔符来解析3列/值):

    df %>% 
      pivot_longer(Center_All_TotalDistMoved,
                   names_to = c("Point", ".value"),
                   names_pattern = "([^_]+)_(.*)")

这做得很好,并解析出“积分”因素:

    # A tibble: 151 x 9
       Phase      MaleID Weight_g      FemaleID Female_side Date            Trx   Point  All_TotalDistMoved
       <fct>       <int> <fct>            <int> <fct>       <fct>           <fct> <chr>               <dbl>
     1 1_baseline      1 48.91                1 Right       7/11/2016 9:26  G1    Center              2385 
     2 1_baseline      2 84.81                2 Right       7/11/2016 10:53 G1    Center              3285.
     3 1_baseline      3 VERIFY_NUMBER        3 Right       7/12/2016 9:50  VEH   Center              2397.
     4 1_baseline      4 69.15                1 Left        7/12/2016 12:32 VEH   Center              4011.
     5 1_baseline      5 86.15                2 Left        7/13/2016 9:52  G1    Center              2746.
     6 1_baseline      6 60.17                3 Left        7/13/2016 11:30 G1    Center              1994.
     7 1_baseline      7 46.82                1 Left        7/14/2016 10:10 VEH   Center              2910.
     8 1_baseline      8 35.52                2 Left        7/14/2016 11:59 VEH   Center              1892.
     9 1_baseline      9 29.92                3 Right       7/15/2016 9:54  G1    Center              4823.
    10 1_baseline     10 28.96                1 Left        7/15/2016 11:32 G1    Center              2119.

但是,由于正则表达式的无知,我无法在没有错误的情况下将上述代码向上扩展到一栏中。此外,我不确定如何在其余各列中执行相同的操作,并且仍将数据与MaleIDPhase匹配。 理想情况下,数据框应该是这样

  MaleID Weight_g Treatment    Phase FemaleID FemaleLocation           Date  Point  Zone TotalDistMoved CumDur FreqZoneEntry LatencyFirstEntry MeanVelocity
1      1    48.91        G1 Baseline        1          Right 7/11/2016 9:26 Center   All           2385     NA            NA                NA      3.98271
2      1    48.91        G1 Baseline        1          Right 7/11/2016 9:26   Nose  Left             NA 112.64           260              4.60           NA
3      1    48.91        G1 Baseline        1          Right 7/11/2016 9:26   Nose Right             NA  92.40           174              0.12           NA
4      1    48.91        G1 Baseline        1          Right 7/11/2016 9:26   Nose  Dead             NA 327.68           536              0.00           NA
5      1    48.91        G1 Baseline        1          Right 7/11/2016 9:26 Center  Left             NA  73.20            23            108.16           NA
6      1    48.91        G1 Baseline        1          Right 7/11/2016 9:26 Center Right             NA  63.16            10            121.64           NA

谢谢!

1 个答案:

答案 0 :(得分:1)

尝试一下:

代码说明:

  1. 当您指定将列名分为三个变量时:点,区域,测量。您可以使用names_to参数使用这些名称创建三列。
  2. names_pattern参数中,使用(.*)_(.*)_(.*)可以提取三个分隔符。请注意,此正则表达式假定_在列名中仅出现两次。 .*与任何单词匹配,而()用于提取单词。
library(tidyr)
library(dplyr)

df %>% 
    pivot_longer(cols = c(8:27), 
                 names_to = c('point', 'zone', 'measurement'), 
                 names_pattern = '(.*)_(.*)_(.*)', 
                 values_to = 'amount') -> df2
df2
#> # A tibble: 160 x 11
#>    Phase MaleID Weight_g FemaleID Female_side Date  Trx   point zone 
#>    <fct>  <int>    <dbl>    <int> <fct>       <fct> <fct> <chr> <chr>
#>  1 1_ba~      1     48.9        1 Right       7/11~ G1    Cent~ All  
#>  2 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Left 
#>  3 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Left 
#>  4 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Left 
#>  5 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Right
#>  6 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Right
#>  7 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Right
#>  8 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Dead 
#>  9 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Dead 
#> 10 1_ba~      1     48.9        1 Right       7/11~ G1    Nose  Dead 
#> # ... with 150 more rows, and 2 more variables: measurement <chr>, amount <dbl>

reprex package(v0.3.0)于2020-07-10创建