我有以下几点:
df <- structure(list(treatment = c("control", "control", "control",
"control", "control", "control", "treated", "treated", "treated",
"treated", "treated", "treated"), `0610005C13Rik` = c(5L, 2L,
2L, 5L, 1L, 0L, 6L, 1L, 0L, 5L, 1L, 2L), `0610007P14Rik` = c(300L,
249L, 166L, 104L, 248L, 136L, 164L, 121L, 191L, 187L, 289L, 169L
), `0610009B22Rik` = c(251L, 158L, 92L, 82L, 239L, 107L, 147L,
96L, 153L, 200L, 211L, 80L), `0610009L18Rik` = c(42L, 17L, 16L,
17L, 10L, 6L, 18L, 1L, 15L, 8L, 19L, 13L), `0610009O20Rik` = c(187L,
77L, 86L, 37L, 81L, 24L, 83L, 57L, 98L, 83L, 113L, 48L), `0610010B08Rik` = c(16L,
3L, 6L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 1L)), .Names = c("treatment",
"0610005C13Rik", "0610007P14Rik", "0610009B22Rik", "0610009L18Rik",
"0610009O20Rik", "0610010B08Rik"), row.names = c(NA, -12L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "treatment", drop = TRUE, indices = list(
0:5, 6:11), group_sizes = c(6L, 6L), biggest_group_size = 6L, labels = structure(list(
treatment = c("control", "treated")), row.names = c(NA, -2L
), class = "data.frame", vars = "treatment", drop = TRUE, .Names = "treatment"))
看起来像这样:
Source: local data frame [12 x 7]
Groups: treatment [2]
treatment `0610005C13Rik` `0610007P14Rik` `0610009B22Rik` `0610009L18Rik` `0610009O20Rik` `0610010B08Rik`
<chr> <int> <int> <int> <int> <int> <int>
1 control 5 300 251 42 187 16
2 control 2 249 158 17 77 3
3 control 2 166 92 16 86 6
4 control 5 104 82 17 37 3
5 control 1 248 239 10 81 2
6 control 0 136 107 6 24 3
7 treated 6 164 147 18 83 3
8 treated 1 121 96 1 57 2
9 treated 0 191 153 15 98 3
10 treated 5 187 200 8 83 2
11 treated 1 289 211 19 113 3
12 treated 2 169 80 13 48 1
我想要做的是根据分组mean
计算treatment
和 http://codepen.io/gsound/ 。简历基本上是 mean / sd
sd / mean
。最终的预期结果如下:
gene_symbol control.mean treated.mean control.cv treated.cv
0610005C13Rik 2.5000 2.500000 0.829457 ...
0610007P14Rik 200.5000 186.833333 ... ...
... etc ...
我怎么能用dplyr做到这一点?
答案 0 :(得分:2)
我们可以gather
然后获取mean/sd
library(tidyverse)
df %>%
gather(gene_symbol, Val, -treatment) %>%
group_by(treatment, gene_symbol) %>%
summarise(Mean = mean(Val), cv = sd(Val)/mean(Val)) %>%
gather(Var1, Val, -treatment,-gene_symbol) %>%
unite(new, treatment, Var1) %>%
spread(new, Val)
# A tibble: 6 × 5
# gene_symbol control_cv control_Mean treated_cv treated_Mean
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 0.8294577 2.5000 0.9715966 2.500000
#2 0610007P14Rik 0.3809605 200.5000 0.2992429 186.833333
#3 0610009B22Rik 0.4823019 154.8333 0.3582799 147.833333
#4 0610009L18Rik 0.6983225 18.0000 0.5515103 12.333333
#5 0610009O20Rik 0.6996217 82.0000 0.3040676 80.333333
#6 0610010B08Rik 0.9672317 5.5000 0.3499271 2.333333
或另一种选择是将mean
,cv
与summarise_all
匹配,然后重塑为“长”格式并将其重新转换回“广泛”
df %>%
group_by(treatment) %>%
summarise_all(funs(mean = mean(.), cv = sd(.)/mean(.))) %>%
gather(Var, Val, -treatment) %>%
separate(Var, into = c('gene_symbol', 'Var2')) %>%
unite(new, treatment, Var2) %>%
spread(new, Val)
# A tibble: 6 × 5
# gene_symbol control_cv control_mean treated_cv treated_mean
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 0.8294577 2.5000 0.9715966 2.500000
#2 0610007P14Rik 0.3809605 200.5000 0.2992429 186.833333
#3 0610009B22Rik 0.4823019 154.8333 0.3582799 147.833333
#4 0610009L18Rik 0.6983225 18.0000 0.5515103 12.333333
#5 0610009O20Rik 0.6996217 82.0000 0.3040676 80.333333
#6 0610010B08Rik 0.9672317 5.5000 0.3499271 2.333333
或者我们可以使用melt/dcast
data.table
执行此操作
library(data.table)
dcast(melt(setDT(df), id.var = "treatment", variable.name = "gene_symbol"
)[, .(mean = mean(value), cv = sd(value)/mean(value)), .(treatment, gene_symbol)
], gene_symbol~treatment, value.var = c('mean', 'cv'))
# gene_symbol mean_control mean_treated cv_control cv_treated
#1: 0610005C13Rik 2.5000 2.500000 0.8294577 0.9715966
#2: 0610007P14Rik 200.5000 186.833333 0.3809605 0.2992429
#3: 0610009B22Rik 154.8333 147.833333 0.4823019 0.3582799
#4: 0610009L18Rik 18.0000 12.333333 0.6983225 0.5515103
#5: 0610009O20Rik 82.0000 80.333333 0.6996217 0.3040676
#6: 0610010B08Rik 5.5000 2.333333 0.9672317 0.3499271
编辑:反映OP公式中的变化
答案 1 :(得分:1)
这是一种使用连接的方法
library("tidyverse")
df %>% gather(key = gene_symbol, value = value,-treatment) %>%
group_by(treatment, gene_symbol) %>%
summarise(mean = mean(value), cv = mean / sd(value)) %>%
ungroup() %>%
left_join(
x = filter(., treatment == "control"),
y = filter(., treatment == "treated"),
by = "gene_symbol",
suffix = c(".control", ".treated")
) %>%
select(-starts_with("treatment"))