r - 在tidyverse中使用单个管道收集多个键

时间:2017-08-31 21:58:05

标签: r dplyr tidyverse

我试图按组计算观察值,最小值和最大值,以便用ggplot绘图。我能够通过以下方式做到这一点:

library(tidyverse)
# Simulated example data: two measured variables (V1, V2), each paired with a
# positive spread column (*_std) drawn from a gamma distribution.
# No seed is set here, so the values differ on every run.
df <- data.frame(
  V1     = rnorm(n = 100, mean = 10, sd = 2),
  V1_std = rgamma(n = 100, shape = 2),
  V2     = rnorm(n = 100, mean = 10, sd = 2),
  V2_std = rgamma(n = 100, shape = 2)
)
# Observed values in long format: one row per (id, variable) pair.
# row_number() replaces 1:nrow(.), which would yield c(1, 0) on an empty frame,
# and the gathered columns are named explicitly instead of via `... = 1:2`.
df_obs <- df %>%
  mutate(id = row_number()) %>%
  select(-c(V1_std, V2_std)) %>%
  gather(key = "key", value = "value", V1, V2)

# Lower bound (value minus its spread) for each variable, in long format.
df_min <- df %>%
  transmute(
    V1_min = V1 - V1_std,
    V2_min = V2 - V2_std
  ) %>%
  gather(key = "key", value = "value", V1_min, V2_min)

# Upper bound (value plus its spread) for each variable, in long format.
df_max <- df %>%
  transmute(
    V1_max = V1 + V1_std,
    V2_max = V2 + V2_std
  ) %>%
  gather(key = "key", value = "value", V1_max, V2_max)

# Attach the bounds column-wise. The bind is positional, so it relies on
# df_obs, df_min and df_max all being stacked in the same row order.
df_final <- cbind.data.frame(
  df_obs,
  min = df_min$value,
  max = df_max$value
)

给了我:

    key     value      min       max
1    V1 11.411261  7.2531585 15.569364
2    V1 10.804986 10.3518400 11.258132
3    V1  9.809049  8.5992110 11.018887
4    V1 11.225030 10.3028566 12.147204
5    V1 10.532991 10.2911703 10.774812
6    V1  9.993123  8.8835294 11.102717
7    V1  6.891480  4.3241776  9.458782
8    V1 12.492458 11.6869964 13.297920
9    V1  8.986359  5.0335530 12.939165
10   V1  8.667382  8.3316045  9.003160

使用单个管道有更简单的方法吗?

谢谢

3 个答案:

答案 0 :(得分:2)

这是我的尝试。真正的问题是列的命名,这使得很难清楚地分成可用于创建中间长数据集的单独变量:

# Rename columns to <variable>_<measure>, go long, split each name into its
# two parts, then spread the measure back out so val and std sit side by side.
df %>%
  set_names(paste0(rep(c("V1", "V2"), each = 2), c("_val", "_std"))) %>%
  mutate(id = row_number()) %>%
  gather(key = key, value = value, -id) %>%
  separate(col = key, into = c("key", "group")) %>%
  spread(key = group, value = value) %>%
  mutate(
    min = val - std,
    max = val + std
  ) %>%
  select(id, key, val, min, max)

#     id key       val        min       max
#1     1  V1  7.342068  5.7006317  8.983503
#2     1  V2  7.372698  5.0818045  9.663591
#3     2  V1 13.397766 11.1738412 15.621691
# ...

为了好玩,这里是使用基础 R 的 reshape 函数的版本:

# Base R equivalent. Columns are renamed to <measure>_<variable> so that
# reshape() can split each name on "_" into the value column (val / std) and
# the key (V1 / V2).
# Fixed: the first variable label was "Vl" (lowercase L) instead of "V1",
# which produced a wrong key value.
# Note: this renames the columns of df in place.
names(df) <- paste(c("val", "std"), rep(c("V1", "V2"), each = 2), sep = "_")
df_final <- reshape(
  df,
  direction = "long",
  sep = "_",
  varying = TRUE,
  timevar = "key"
)
df_final <- transform(
  df_final,
  min = val - std,
  max = val + std
)[c("id", "key", "val", "min", "max")]

答案 1 :(得分:2)

这是我的版本。它有点乱,但如果你添加更多具有相同模式的列,它应该会继续工作。

# Long-then-wide reshape keyed on the first two characters of each column name.
# Fixed: max was computed as value - `_std` (identical to min); it is now
# value + `_std`. row_number() replaces 1:n(), which breaks on zero rows.
df %>%
  mutate(ID = row_number()) %>%
  gather("key", "value", -ID) %>%
  # Split after the 2nd character: "V1_std" -> ("V1", "_std"), "V1" -> ("V1", "")
  separate(key, c("var", "SD"), sep = 2) %>%
  # The plain value columns have an empty suffix; label them "value"
  mutate(SD = replace(SD, SD == "", "value")) %>%
  spread("SD", "value") %>%
  mutate(min = value - `_std`, max = value + `_std`) %>%
  select(-`_std`, -ID) %>%
  arrange(var)

答案 2 :(得分:2)

还有另一个版本:

# Stack one (value, std) slice per variable, tagging each with its key, then
# derive both bounds and the final column order in a single transmute.
bind_rows(
  df %>% transmute(v = V1, std = V1_std, key = "V1"),
  df %>% transmute(v = V2, std = V2_std, key = "V2")
) %>%
  transmute(
    key,
    value = v,
    min = v - std,
    max = v + std
  )
    key     value       min       max
1    V1  8.747092  7.910670  9.583515
2    V1 10.367287  9.526856 11.207717
3    V1  8.328743  6.628955 10.028531
4    V1 13.190562 12.385141 13.995983
5    V1 10.659016 10.317527 11.000504
6    V1  8.359063  5.537228 11.180899

可重复数据:

# Seeded example data so the printed results can be reproduced.
# The four columns are drawn in the same order as before (V1, V1_std, V2,
# V2_std), so the generated values are unchanged.
set.seed(1)
df <- data.frame(
  V1     = rnorm(n = 100, mean = 10, sd = 2),
  V1_std = rgamma(n = 100, shape = 2),
  V2     = rnorm(n = 100, mean = 10, sd = 2),
  V2_std = rgamma(n = 100, shape = 2)
)