我试图按组计算观察值,最小值和最大值,以便用ggplot绘图。我能够通过以下方式做到这一点:
library(tidyverse)
df <- cbind.data.frame("V1"=rnorm(100,10,2),
"V1_std"=rgamma(100,2),
"V2"=rnorm(100,10,2),
"V2_std"=rgamma(100,2))
df_obs <- df %>%
mutate("id"=1:nrow(.)) %>%
select(-c(V1_std,V2_std)) %>%
gather(...=1:2)
df_min <- df %>% mutate("V1_min"= V1 - V1_std) %>%
mutate("V2_min"= V2 - V2_std) %>%
select(V1_min,V2_min) %>%
gather(...=1:2)
df_max <- df %>% mutate("V1_max"= V1 + V1_std) %>%
mutate("V2_max"= V2 + V2_std) %>%
select(V1_max,V2_max) %>%
gather(...=1:2)
df_final <- cbind.data.frame(df_obs,'min'=df_min$value,'max'=df_max$value)
给了我:
key value min max
1 V1 11.411261 7.2531585 15.569364
2 V1 10.804986 10.3518400 11.258132
3 V1 9.809049 8.5992110 11.018887
4 V1 11.225030 10.3028566 12.147204
5 V1 10.532991 10.2911703 10.774812
6 V1 9.993123 8.8835294 11.102717
7 V1 6.891480 4.3241776 9.458782
8 V1 12.492458 11.6869964 13.297920
9 V1 8.986359 5.0335530 12.939165
10 V1 8.667382 8.3316045 9.003160
使用单个管道有更简单的方法吗?
由于
答案 0 :(得分:2)
这是我的尝试。真正的问题是列的命名,这使得很难清楚地分成可用于创建中间长数据集的单独变量:
df %>%
set_names(paste(rep(c("V1","V2"),each=2),c("val","std"),sep="_")) %>%
mutate(id = row_number()) %>%
gather(key,value,-id) %>%
separate(key, c("key","group")) %>%
spread(group, value) %>%
mutate(min=val - std, max=val + std) %>%
select(-std)
# id key val min max
#1 1 V1 7.342068 5.7006317 8.983503
#2 1 V2 7.372698 5.0818045 9.663591
#3 2 V1 13.397766 11.1738412 15.621691
# ...
为了好玩,这里是基础reshape
版本:
names(df) <- paste(c("val","std"),rep(c("Vl","V2"),each=2),sep="_")
df_final <- reshape(df, direction="long", sep="_", varying=TRUE, timevar="key")
df_final <- transform(df_final, min = val-std, max=val+std)[c("id","key","val","min","max")]
答案 1 :(得分:2)
这是我的版本。它有点乱,但如果你添加更多具有相同模式的列,它应该会继续工作。
df %>% mutate(ID=1:n()) %>% gather("key", "value", -ID) %>%
separate(key, c("var", "SD"), sep=2) %>%
mutate(SD=replace(SD, SD == "", "value")) %>% spread("SD", "value") %>%
mutate(min = value - `_std`, max=value - `_std`) %>%
select(-`_std`, -ID) %>% arrange(var)
答案 2 :(得分:2)
还有另一个版本:
bind_rows(df %>% select(v=V1,std=V1_std) %>% mutate(key="V1"),
df %>% select(v=V2,std=V2_std) %>% mutate(key="V2")) %>%
mutate(max = v + std,
min = v - std) %>%
select(key, value=v, min, max)
key value min max 1 V1 8.747092 7.910670 9.583515 2 V1 10.367287 9.526856 11.207717 3 V1 8.328743 6.628955 10.028531 4 V1 13.190562 12.385141 13.995983 5 V1 10.659016 10.317527 11.000504 6 V1 8.359063 5.537228 11.180899
可重复数据:
set.seed(1)
df <- data.frame(V1=rnorm(100,10,2),
V1_std=rgamma(100,2),
V2=rnorm(100,10,2),
V2_std=rgamma(100,2))