如何在 R 中对含多组列的数据框做 melt(宽表转长表)?

时间:2019-04-12 12:55:19

标签: r data.table

我有一份宽格式的数据,想把它转换为长格式(long format)。

library(tidyverse)
library(data.table)
#> 
#> Attaching package: 'data.table'
#> The following objects are masked from 'package:dplyr':
#> 
#>     between, first, last
#> The following object is masked from 'package:purrr':
#> 
#>     transpose

# Example data in wide format: a tibble with 28 rows and 14 columns.
# The columns come in groups: a `julian_days*` column followed by one or more
# (measurement, sd) pairs. Judging by the expected output shown further down,
# `julian_days_2` dates both `water_1_5_m_depth`/`sd_2` and
# `water_10_m_depth`/`sd_3`. Series shorter than 28 values are padded with NA.
df <- structure(list(
  julian_days = c(
    127, 130, 132, 134, 137, 139,
    141, 144, 148, 151, 153, 155, 158, 160, 162, 165, 167, 169, 172,
    NA, NA, NA, NA, NA, NA, NA, NA, NA
  ), sea_ice_algae_last_cm = c(
    0.636,
    0.698, 0.666666666666667, 0.685384615384615, 0.713, 0.6375, 0.58375,
    0.637272727272727, 0.6575, 0.691666666666667, 0.629166666666667,
    0.637142857142857, 0.589166666666667, 0.56, 0.571818181818182,
    0.492, 0.31, 0.312, 0.203076923076923, NA, NA, NA, NA, NA, NA,
    NA, NA, NA
  ), sd = c(
    0.0227058484879019, 0.0369684550213647, 0.0533853912601565,
    0.0525381424324881, 0.0413790070231539, 0.0381682876458741, 0.0277788888666675,
    0.0410099766132362, 0.0222076972732838, 0.0194079021706795, 0.0299873710792131,
    0.0363841933236059, 0.0253908835942542, 0.055746679790749, 0.0604678727620178,
    0.0294957624075053, 0.10770329614269, 0.0657267069006199, 0.0693282789084673,
    NA, NA, NA, NA, NA, NA, NA, NA, NA
  ), julian_days_2 = c(
    127, 130,
    132, 134, 137, 139, 141, 144, 146, 148, 151, 153, 155, 158, 160,
    162, 165, 167, 169, 172, 174, 176, 179, 181, 183, 186, 188, 190
  ), water_1_5_m_depth = c(
    0.69, 0.5475, 0.596, 0.512, 0.598, 0.488333333333333,
    0.27, 0.41, 0.568, 0.503333333333333, 0.668333333333333, 0.71,
    0.636666666666667, 0.623333333333333, 0.66, 0.541666666666667,
    0.57, 0.545, 0.501666666666667, 0.526666666666667, 0.566666666666667,
    0.493333333333333, 0.59, 0.518333333333333, 0.443333333333333,
    0.605, 0.58, 0.478333333333333
  ), sd_2 = c(
    0.121655250605964,
    0.0718215380880506, 0.0736885337077625, 0.0376828873628335, 0.084380092438916,
    0.0636919670497516, 0.054037024344425, 0.0540370243444251, 0.0370135110466435,
    0.0571547606649408, 0.0702614166286638, 0.0442718872423573, 0.0799166232186176,
    0.0480277697448743, 0.0409878030638384, 0.0462240918425302, 0.0920869154657709,
    0.0706399320497981, 0.0511533641774093, 0.100531918646103, 0.0186189867250252,
    0.0588784057755188, 0.0841427358718512, 0.0934701378337842, 0.0492612085384298,
    0.0653452370108182, 0.0878635305459549, 0.0851860708488579
  ),
  water_10_m_depth = c(
    0.66, 0.732, 0.595, 0.712, 0.514, 0.48,
    0.35, 0.44, 0.535, 0.403333333333333, 0.728, 0.746, 0.625,
    0.698333333333333, 0.705, 0.555, 0.585, 0.651666666666667,
    0.603333333333333, 0.595, 0.615, 0.615, 0.658333333333333,
    0.641666666666667, 0.623333333333333, 0.628333333333333,
    0.661666666666667, 0.631666666666667
  ), sd_3 = c(
    0, 0.0342052627529742,
    0.0387298334620742, 0.0327108544675923, 0.0610737259384104,
    0.0700000000000001, 0.127279220613579, 0.0972111104761177,
    0.0564800849857717, 0.0504645089807343, 0.0540370243444252,
    0.0415932686861709, 0.0809320702811933, 0.0475043857624395,
    0.0398748040747538, 0.0568330889535313, 0.0388587184554509,
    0.0204124145231932, 0.058878405775519, 0.0896102672688791,
    0.0535723809439155, 0.0488876262463212, 0.043089055068157,
    0.0306050104830347, 0.0527888877195444, 0.0708284312029193,
    0.0426223728418147, 0.0348807492274272
  ), julian_days_3 = c(
    134,
    137, 139, 141, 146, 148, 153, 155, 160, 162, 165, 169, 172,
    174, 176, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA
  ), water_40_m_depth = c(
    0.523166666666667, 0.360833333333333,
    0.279, 0.228, 0.551166666666667, 0.358666666666667, 0.593,
    0.6225, 0.6665, 0.5468, 0.334714285714286, 0.654, 0.567666666666667,
    0.664166666666667, 0.6345, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA
  ), sd_4 = c(
    0.0793937445058905, 0.0346145441493408,
    0.0834625664594612, 0.105740247777277, 0.0437008771841786,
    0.0810719844747042, 0.0849529281425892, 0.0539620236833275,
    0.0689514321823702, 0.0344992753547085, 0.0889713704621029,
    0.064221491729794, 0.0166933120340652, 0.0545982295195244,
    0.0578472125516865, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA
  ), julian_days_4 = c(
    181, 183, 186, 188, 190, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA
  ), water_60_m_depth = c(
    0.617833333333333,
    0.492333333333333, 0.642166666666667, 0.7265, 0.686166666666667,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA
  ), sd_5 = c(
    0.0574818812032684,
    0.049766119666563, 0.0704540039079871, 0.0286618212959331,
    0.0382225936674458, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
  )
), row.names = c(
  NA,
  -28L
), class = c("tbl_df", "tbl", "data.frame"))

# Sort by the last julian-day column, descending, to inspect the rows around day 190
df %>%
  arrange(desc(julian_days_4))
#> # A tibble: 28 x 14
#>    julian_days sea_ice_algae_l…     sd julian_days_2 water_1_5_m_dep…
#>          <dbl>            <dbl>  <dbl>         <dbl>            <dbl>
#>  1         137            0.713 0.0414           137            0.598
#>  2         134            0.685 0.0525           134            0.512
#>  3         132            0.667 0.0534           132            0.596
#>  4         130            0.698 0.0370           130            0.548
#>  5         127            0.636 0.0227           127            0.69 
#>  6         139            0.638 0.0382           139            0.488
#>  7         141            0.584 0.0278           141            0.27 
#>  8         144            0.637 0.0410           144            0.41 
#>  9         148            0.658 0.0222           146            0.568
#> 10         151            0.692 0.0194           148            0.503
#> # … with 18 more rows, and 9 more variables: sd_2 <dbl>,
#> #   water_10_m_depth <dbl>, sd_3 <dbl>, julian_days_3 <dbl>,
#> #   water_40_m_depth <dbl>, sd_4 <dbl>, julian_days_4 <dbl>,
#> #   water_60_m_depth <dbl>, sd_5 <dbl>

我想将所有这些“堆叠”成3列:

- julian:所有以 "julian" 开头的列
- measure:所有以 "water" 或 "sea" 开头的列
- sd:所有以 "sd" 开头的列

请注意,"water" 列名中的数字表示深度(例如:water_1_5_m_depth 表示 1.5 m)。以每个序列的第一行为例,期望的输出如下:

# Target shape, illustrated with the first row of each series (values rounded).
tribble(
  ~julian, ~type,       ~measure, ~sd,
  127,     "sea",       0.64,     0.02,
  127,     "water_1.5", 0.69,     0.12,
  127,     "water_10",  0.66,     0,
  134,     "water_40",  0.52,     0.08,
  181,     "water_60",  0.62,     0.06
)
#> # A tibble: 5 x 4
#>   julian type      measure    sd
#>    <dbl> <chr>       <dbl> <dbl>
#> 1    127 sea          0.64  0.02
#> 2    127 water_1.5    0.69  0.12
#> 3    127 water_10     0.66  0   
#> 4    134 water_40     0.52  0.08
#> 5    181 water_60     0.62  0.06

到目前为止,我用 data.table 做了如下尝试:

# Attempt with data.table: one regular expression per intended output column,
# with a matching entry in `value.name`.
# setDT() turns `df` into a data.table in place before it is melted.
melt(
  setDT(df),
  measure.vars = patterns(
    "^julian",
    "^sea",
    "^water_1_5", "^water_10", "^water_40", "^water_60",
    "^sd"
  ),
  value.name = c(
    "julian",
    "sea",
    "water_1.5", "water_10", "water_40", "water_60",
    "sd"
  )
)
#>      variable julian       sea water_1.5 water_10  water_40  water_60
#>   1:        1    127 0.6360000    0.6900    0.660 0.5231667 0.6178333
#>   2:        1    130 0.6980000    0.5475    0.732 0.3608333 0.4923333
#>   3:        1    132 0.6666667    0.5960    0.595 0.2790000 0.6421667
#>   4:        1    134 0.6853846    0.5120    0.712 0.2280000 0.7265000
#>   5:        1    137 0.7130000    0.5980    0.514 0.5511667 0.6861667
#>  ---                                                                 
#> 136:        5     NA        NA        NA       NA        NA        NA
#> 137:        5     NA        NA        NA       NA        NA        NA
#> 138:        5     NA        NA        NA       NA        NA        NA
#> 139:        5     NA        NA        NA       NA        NA        NA
#> 140:        5     NA        NA        NA       NA        NA        NA
#>              sd
#>   1: 0.02270585
#>   2: 0.03696846
#>   3: 0.05338539
#>   4: 0.05253814
#>   5: 0.04137901
#>  ---           
#> 136:         NA
#> 137:         NA
#> 138:         NA
#> 139:         NA
#> 140:         NA

如能提供任何帮助,不胜感激。

更新:

这是我收到的文件。

enter image description here

由 reprex package(v0.2.1)于 2019-04-12 创建

3 个答案:

答案 0 :(得分:1)

如果您能给出期望的输出就更好了。我猜这就是您想要的:

  # Stack the water series into three columns: julian, measure, sd.
  #
  # The column families cannot be gathered separately and glued together by
  # position with bind_cols(): there are 4 julian columns but 5 sd columns, so
  # the pieces have different row counts, and water_1_5 and water_10 share
  # julian_days_2. Instead, pair every water column with the closest julian
  # column to its left and with the sd column directly to its right.
  col_names <- names(df)
  julian_pos <- grep("^julian", col_names)

  grep("^water", col_names) %>%
    map(function(pos) {
      tibble(
        # last julian column that appears before this measurement column
        julian = df[[max(julian_pos[julian_pos < pos])]],
        measure = df[[pos]],
        # each measurement column is immediately followed by its sd column
        sd = df[[pos + 1L]]
      )
    }) %>%
    bind_rows()


  julian measure     sd
    <dbl>   <dbl>  <dbl>
 1    127   0.69  0.122 
 2    130   0.548 0.0718
 3    132   0.596 0.0737
 4    134   0.512 0.0377
 5    137   0.598 0.0844
 6    139   0.488 0.0637
 7    141   0.27  0.0540
 8    144   0.41  0.0540
 9    146   0.568 0.0370
10    148   0.503 0.0572
# ... with 102 more rows

在这次尝试中,我没有包含以 sea 开头的变量,因为那样会导致一对多的合并。如果我的方向是对的,请告诉我,我再把它们加进来。

答案 1 :(得分:1)

library(tidyverse)

# Cut the wide table into column groups of sizes 3, 5, 3 and 3.
# The second group holds two series that share one julian column, so it is
# divided further: columns 1, 4, 5 become a fifth piece and columns 1-3 stay.
list_of_dfs <- split.default(df, rep(1:4, times = c(3, 5, 3, 3)))
list_of_dfs[[5]] <- list_of_dfs[[2]][, c(1, 4, 5)]
list_of_dfs[[2]] <- list_of_dfs[[2]][, 1:3]

# For every piece: keep complete rows only, record the name of the measurement
# column as `type`, give all pieces the same column names, then stack them.
list_of_dfs %>%
  map(function(piece) {
    kept <- piece[complete.cases(piece), ]
    kept <- mutate(kept, type = grep("^sea|^water", names(kept), value = TRUE))
    setNames(kept, nm = c("julian", "measure", "sd", "type"))
  }) %>%
  bind_rows()

# # A tibble: 95 x 4
#    julian measure     sd type                 
#     <dbl>   <dbl>  <dbl> <chr>                
#  1    127   0.636 0.0227 sea_ice_algae_last_cm
#  2    130   0.698 0.0370 sea_ice_algae_last_cm
#  3    132   0.667 0.0534 sea_ice_algae_last_cm
#  4    134   0.685 0.0525 sea_ice_algae_last_cm
#  5    137   0.713 0.0414 sea_ice_algae_last_cm
#  6    139   0.638 0.0382 sea_ice_algae_last_cm
#  7    141   0.584 0.0278 sea_ice_algae_last_cm
#  8    144   0.637 0.0410 sea_ice_algae_last_cm
#  9    148   0.658 0.0222 sea_ice_algae_last_cm
# 10    151   0.692 0.0194 sea_ice_algae_last_cm
# # … with 85 more rows

答案 2 :(得分:0)

# Melt the three column families one after another.
#
# `patterns()` is only interpreted by the data.table method of melt(), so the
# input is converted to a data.table explicitly instead of relying on `df`
# having been converted earlier in the session.
# The patterns are passed unnamed so that `value.name` alone decides the name
# of each value column.
#
# Each step keeps the previously melted columns as id columns, so the families
# are crossed rather than matched: 28 rows x 4 julian x 5 measure x 5 sd
# columns = 2800 rows.
data.table::melt(
  data.table::as.data.table(df),
  measure.vars = patterns("^julian"),
  variable.name = "julian_variable",
  value.name = "julian_value"
) %>%
  data.table::melt(
    measure.vars = patterns("^sea|^water"),
    variable.name = "measure_variable",
    value.name = "measure_value"
  ) %>%
  data.table::melt(
    measure.vars = patterns("^sd"),
    variable.name = "sd_variable",
    value.name = "sd_value"
  )

#       julian_variable julian_value      measure_variable measure_value sd_variable   sd_value
#    1:     julian_days          127 sea_ice_algae_last_cm     0.6360000          sd 0.02270585
#    2:     julian_days          130 sea_ice_algae_last_cm     0.6980000          sd 0.03696846
#    3:     julian_days          132 sea_ice_algae_last_cm     0.6666667          sd 0.05338539
#    4:     julian_days          134 sea_ice_algae_last_cm     0.6853846          sd 0.05253814
#    5:     julian_days          137 sea_ice_algae_last_cm     0.7130000          sd 0.04137901
#   ---                                                                                        
# 2796:   julian_days_4           NA      water_60_m_depth            NA        sd_5         NA
# 2797:   julian_days_4           NA      water_60_m_depth            NA        sd_5         NA
# 2798:   julian_days_4           NA      water_60_m_depth            NA        sd_5         NA
# 2799:   julian_days_4           NA      water_60_m_depth            NA        sd_5         NA
# 2800:   julian_days_4           NA      water_60_m_depth            NA        sd_5         NA

不过,我并不清楚期望的输出到底是什么。这种解法显然会产生大量重复:基本上每个值都会重复 100 次(4 个 "julian" 列 × 5 个 "measure" 列 × 5 个 "sd" 列)。