使用gather()创建单独的整洁列

时间:2019-09-06 11:14:56

标签: r tidyr

我有姿势血压数据(在3个时间点的脉搏,收缩压和舒张压),以及一些其他关键变量,并且想要创建整齐的血压数据。

我曾尝试对单个变量类型(例如收缩压)使用gather(),这样可以得到预期结果;但对全部3个变量类型依次使用时就不行了(原始数据有59名参与者,把3个gather()调用组合在一起后变成了1593行)。

library(tidyverse)

# Example data in wide format: one row per participant, with one column per
# measurement (systolic, diastolic, pulse) and posture/time point.
ID <- c(1138, 430, 245, 1431, 1587)
group <- c(1, 1, 2, 3, 4)
measure_1 <- c(8, 14, 6, 11, 28)
systolic_lying <- c(169, 141, 144, 120, 88)
systolic_standing_1 <- c(163, 156, 129, 127, 102)
systolic_standing_3 <- c(179, 152, 146, 127, 106)
diastolic_lying <- c(80, 93, 80, 71, 66)
diastolic_standing_1 <- c(87, 97, 79, 77, 75)
diastolic_standing_3 <- c(92, 100, 83, 79, 77)
pulse_lying <- c(58, 71, 58, 63, 98)
# NOTE(review): these two are named pulse_lying_*, unlike the *_standing_*
# columns of the other two measurements -- confirm this is intended.
pulse_lying_1 <- c(62, 93, 61, 70, 73)
pulse_lying_3 <- c(67, 97, 63, 71, 76)

# Combine the vectors into a tibble; the column names are taken from the
# variable names.
bp <- tibble(
  ID, group, measure_1,
  systolic_lying, systolic_standing_1, systolic_standing_3,
  diastolic_lying, diastolic_standing_1, diastolic_standing_3,
  pulse_lying, pulse_lying_1, pulse_lying_3
)

head(bp)
# A tibble: 5 x 12
     ID group measure_1 systolic_lying systolic_standi… systolic_standi…
  <dbl> <dbl>     <dbl>          <dbl>            <dbl>            <dbl>
1  1138     1         8            169              163              179
2   430     1        14            141              156              152
3   245     2         6            144              129              146
4  1431     3        11            120              127              127
5  1587     4        28             88              102              106
# … with 6 more variables: diastolic_lying <dbl>,
#   diastolic_standing_1 <dbl>, diastolic_standing_3 <dbl>,
#   pulse_lying <dbl>, pulse_lying_1 <dbl>, pulse_lying_3 <dbl>

仅对其中一类变量使用gather(),例如:

# Gather only the three systolic columns into a key/value pair
# (systolic_posture / systolic); all other columns are left untouched.
# The input is `bp`, the tibble created above (`bloodpressure` is never
# defined in this example).
tidybp <- gather(bp, "systolic_posture", "systolic",
                 systolic_lying:systolic_standing_3)

dim(tidybp)
[1] 15 11 #This is what I expect.

head(tidybp[c(1:3, 10:11)])
# A tibble: 6 x 5
     ID group measure_1 systolic_posture    systolic
  <dbl> <dbl>     <dbl> <chr>                  <dbl>
1  1587     4        28 systolic_lying            88
2  1138     1         8 systolic_standing_1      163
3   430     1        14 systolic_standing_1      156
4   245     2         6 systolic_standing_1      129
5  1431     3        11 systolic_standing_1      127
6  1587     4        28 systolic_standing_1      102

但是,这样做之后,所有舒张压和脉搏列(diastolic_lying、diastolic_standing_1 ... pulse_lying_1、pulse_lying_3)仍然是宽格式,没有被收集。

然后我对tidybp再次使用gather(),例如把舒张压变量收集到一起,但这会使已经收集过收缩压数据的数据框行数变为原来的三倍。

# Gather each measurement type in turn. Every call stacks three columns on top
# of data that is already long, so the row count triples each time
# (5 -> 15 -> 45 -> 135).
tidybp <- gather(bp, "systolic_posture", "systolic",
                 systolic_lying:systolic_standing_3)
tidybp <- gather(tidybp, "diastolic_posture", "diastolic",
                 diastolic_lying:diastolic_standing_3)
# The pulse columns in `bp` are pulse_lying, pulse_lying_1 and pulse_lying_3.
tidybp <- gather(tidybp, "pulse_posture", "pulse", pulse_lying:pulse_lying_3)
dim(tidybp)
[1] 135   9 #each subsequent iteration triples the number of observations

我的目标是:


tidybp
# A tibble: 15 x 9
      ID group measure_1 systolic_posture    systolic diastolic_posture diastolic pulse_posture pulse
   <dbl> <dbl>     <dbl> <chr>                  <dbl> <chr>                 <dbl> <chr>         <dbl>
 1  1138     1         8 systolic_lying           169
 2   430     1        14 systolic_lying           141
 3   245     2         6 systolic_lying           144
 4  1431     3        11 systolic_lying           120
 5  1587     4        28 systolic_lying            88
 6  1138     1         8 systolic_standing_1      163
 7   430     1        14 systolic_standing_1      156
 8   245     2         6 systolic_standing_1      129
 9  1431     3        11 systolic_standing_1      127
10  1587     4        28 systolic_standing_1      102
11  1138     1         8 systolic_standing_3      179
12   430     1        14 systolic_standing_3      152
13   245     2         6 systolic_standing_3      146
14  1431     3        11 systolic_standing_3      127
15  1587     4        28 systolic_standing_3      106

带有相应的舒张期和脉搏键值对。

不胜感激。

4 个答案:

答案 0 :(得分:0)

这可以通过tidyverse来完成,但是使用良好的旧base::reshape,恕我直言要容易得多:

# One vector of wide column names per measurement; within each vector the
# columns are listed in the same order as `times` below.
v <- list(
  c("diastolic_lying", "diastolic_standing_1", "diastolic_standing_3"),
  c("pulse_lying", "pulse_lying_1", "pulse_lying_3"),
  c("systolic_lying", "systolic_standing_1", "systolic_standing_3")
)

# Stack the three column sets into long format with base::reshape(), then
# convert the result back to a tibble.
as_tibble(
  reshape(
    as.data.frame(bp),
    varying   = v,
    v.names   = c("diastolic", "pulse", "systolic"),
    timevar   = "type",
    idvar     = "ID",
    times     = c("lying", "standing_1", "standing_3"),
    direction = "long"
  )
)
# A tibble: 15 x 7
#       ID group measure_1 type       diastolic pulse systolic
#    <dbl> <dbl>     <dbl> <chr>          <dbl> <dbl>    <dbl>
#  1  1138     1         8 lying             80    58      169
#  2   430     1        14 lying             93    71      141
#  3   245     2         6 lying             80    58      144
#  4  1431     3        11 lying             71    63      120
#  5  1587     4        28 lying             66    98       88
#  6  1138     1         8 standing_1        87    62      163
#  7   430     1        14 standing_1        97    93      156
#  8   245     2         6 standing_1        79    61      129
#  9  1431     3        11 standing_1        77    70      127
# 10  1587     4        28 standing_1        75    73      102
# 11  1138     1         8 standing_3        92    67      179
# 12   430     1        14 standing_3       100    97      152
# 13   245     2         6 standing_3        83    63      146
# 14  1431     3        11 standing_3        79    71      127
# 15  1587     4        28 standing_3        77    76      106

答案 1 :(得分:0)

用tidyverse实现这一点并不难,而且它不依赖硬编码的列名参数,因此当数据发生变化(例如增加其他测量项)时,代码仍然可以继续工作。


library(tidyverse)

# Example data with the same values as in the question, built directly as
# tibble columns. The pulse columns are named pulse_standing_1 and
# pulse_standing_3 here.
bp <- tibble(
  ID = c(1138, 430, 245, 1431, 1587),
  group = c(1, 1, 2, 3, 4),
  measure_1 = c(8, 14, 6, 11, 28),
  systolic_lying = c(169, 141, 144, 120, 88),
  systolic_standing_1 = c(163, 156, 129, 127, 102),
  systolic_standing_3 = c(179, 152, 146, 127, 106),
  diastolic_lying = c(80, 93, 80, 71, 66),
  diastolic_standing_1 = c(87, 97, 79, 77, 75),
  diastolic_standing_3 = c(92, 100, 83, 79, 77),
  pulse_lying = c(58, 71, 58, 63, 98),
  pulse_standing_1 = c(62, 93, 61, 70, 73),
  pulse_standing_3 = c(67, 97, 63, 71, 76)
)

# Stack every measurement column into key/value form, split each column name
# into the measurement (the lowercase letters before the first underscore) and
# the posture (everything after that underscore), then spread the measurements
# back out so each one gets its own column.
bp %>%
  gather(key, value, -c(ID, group, measure_1)) %>%
  mutate(
    posture = str_extract(key, '(?<=[a-z]{1,20}_).+'),
    measure = str_extract(key, '^[a-z]+(?=_)')
  ) %>%
  # The original column name is no longer needed once it has been split.
  select(-key) %>%
  group_by(ID, group, measure_1, posture) %>%
  spread(measure, value) %>%
  arrange(posture)

注意:我把pulse_lying_1和pulse_lying_3改名为pulse_standing_1和pulse_standing_3。否则它们的姿势值(lying_1、lying_3)与其他测量不一致,会落到与预期不同的单独行中。

结果:


# A tibble: 15 x 7
# Groups:   ID, group, measure_1, posture [15]
      ID group measure_1 posture    diastolic pulse systolic
   <dbl> <dbl>     <dbl> <chr>          <dbl> <dbl>    <dbl>
 1   245     2         6 lying             80    58      144
 2   430     1        14 lying             93    71      141
 3  1138     1         8 lying             80    58      169
 4  1431     3        11 lying             71    63      120
 5  1587     4        28 lying             66    98       88
 6   245     2         6 standing_1        79    61      129
 7   430     1        14 standing_1        97    93      156
 8  1138     1         8 standing_1        87    62      163
 9  1431     3        11 standing_1        77    70      127
10  1587     4        28 standing_1        75    73      102
11   245     2         6 standing_3        83    63      146
12   430     1        14 standing_3       100    97      152
13  1138     1         8 standing_3        92    67      179
14  1431     3        11 standing_3        79    71      127
15  1587     4        28 standing_3        77    76      106

答案 2 :(得分:0)

编辑:我最初的解决方案使用了left_join,带来了副作用——产生了大量重复的数据行。已在下文更正。

我认为这是您正在寻找的输出,在此我们可以看到所有三个姿势以及每个姿势的配对值。

在这种情况下,我使用bind_cols将三个键值对组合为您所描述的格式。

# Gather each measurement type separately and glue the three results together
# column-wise. Each piece is gathered from the same `bp`, so all three have
# their rows in the same order (stacked column by column).
bind_cols(
  # Systolic pair, keeping the identifying columns ID:measure_1.
  bp %>%
    select(ID:measure_1, systolic_lying:systolic_standing_3) %>%
    gather("systolic_posture", "systolic", -(ID:measure_1)),
  # Diastolic pair only; the identifying columns come from the first piece.
  bp %>%
    select(diastolic_lying:diastolic_standing_3) %>%
    gather("diastolic_posture", "diastolic"),
  # Pulse pair only.
  bp %>%
    select(pulse_lying:pulse_lying_3) %>%
    gather("pulse_posture", "pulse")
)


# A tibble: 15 x 9
   ID group measure_1 systolic_posture    systolic diastolic_posture    diastolic pulse_posture pulse
   <dbl> <dbl>     <dbl> <chr>                  <dbl> <chr>                    <dbl> <chr>         <dbl>
 1  1138     1         8 systolic_lying           169 diastolic_lying             80 pulse_lying      58
 2   430     1        14 systolic_lying           141 diastolic_lying             93 pulse_lying      71
 3   245     2         6 systolic_lying           144 diastolic_lying             80 pulse_lying      58
 4  1431     3        11 systolic_lying           120 diastolic_lying             71 pulse_lying      63
 5  1587     4        28 systolic_lying            88 diastolic_lying             66 pulse_lying      98
 6  1138     1         8 systolic_standing_1      163 diastolic_standing_1        87 pulse_lying_1    62
 7   430     1        14 systolic_standing_1      156 diastolic_standing_1        97 pulse_lying_1    93
 8   245     2         6 systolic_standing_1      129 diastolic_standing_1        79 pulse_lying_1    61
 9  1431     3        11 systolic_standing_1      127 diastolic_standing_1        77 pulse_lying_1    70
10  1587     4        28 systolic_standing_1      102 diastolic_standing_1        75 pulse_lying_1    73
11  1138     1         8 systolic_standing_3      179 diastolic_standing_3        92 pulse_lying_3    67
12   430     1        14 systolic_standing_3      152 diastolic_standing_3       100 pulse_lying_3    97
13   245     2         6 systolic_standing_3      146 diastolic_standing_3        83 pulse_lying_3    63
14  1431     3        11 systolic_standing_3      127 diastolic_standing_3        79 pulse_lying_3    71
15  1587     4        28 systolic_standing_3      106 diastolic_standing_3        77 pulse_lying_3    76

答案 3 :(得分:0)

使用 tidyr 1.0.0 ,您可以执行以下操作:

library(tidyr)

# Split each column name at its first underscore: the part before it names the
# output column (".value"), the remainder becomes the `posture` value.
pivot_longer(
  bp,
  cols = -(1:3),
  names_to = c(".value", "posture"),
  names_pattern = "([^_]+)_(.+)"
)
#> # A tibble: 25 x 7
#>       ID group measure_1 posture    systolic diastolic pulse
#>    <dbl> <dbl>     <dbl> <chr>         <dbl>     <dbl> <dbl>
#>  1  1138     1         8 lying           169        80    58
#>  2  1138     1         8 standing_1      163        87    NA
#>  3  1138     1         8 standing_3      179        92    NA
#>  4  1138     1         8 lying_1          NA        NA    62
#>  5  1138     1         8 lying_3          NA        NA    67
#>  6   430     1        14 lying           141        93    71
#>  7   430     1        14 standing_1      156        97    NA
#>  8   430     1        14 standing_3      152       100    NA
#>  9   430     1        14 lying_1          NA        NA    93
#> 10   430     1        14 lying_3          NA        NA    97
#> # ... with 15 more rows