dplyr的summarise只输出一个值

时间:2016-05-06 11:47:39

标签: r dplyr

我有一个像这样的数据集:

> dput(data_melt)
structure(list(Compound = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Compd1", "Compound1"
), class = "factor"), Concentration = structure(c(5L, 1L, 2L, 
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 
3L, 4L, 5L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 5L, 1L, 
2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 
2L, 3L, 4L, 5L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 5L, 
1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 5L, 1L, 2L, 3L, 4L, 5L, 
1L, 2L, 3L, 4L, 5L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 
5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L), .Label = c(".01uM", 
".1uM", "1.0uM", "10uM", "DMSO"), class = "factor"), Co.Agonist = structure(c(1L, 
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 
3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
3L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("High/High", 
"High/Low", "Low/High"), class = "factor"), variable = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), class = "factor", .Label = c("Tau1", 
"Tau2", "Peak.Mean", "Area.Mean", "SS1.Mean")), value = c(1.38196479885153, 
1.14405508500302, 0.988961970528585, 1.44849126088629, 0.492462666110073, 
NA, 2.17712741416582, 1.67028746892543, 1.96489211952819, 1.2460774224718, 
NA, 0.485501088636007, 0.580120526488655, 0.530538989313623, 
0.884536240505712, 0.322958355856638, 0.740882962734369, 1.18088839355135, 
1.48745421674211, 1.16792544841743, 1.11656132754921, 1.14457816659658, 
0.0675070264176897, 0.176054869732887, 0.174862277854592, 0.200470189214318, 
0.187717771153427, 0.181176140081454, 0.117339926372974, 0.0941816692818621, 
0.156408537242293, 0.171156092362873, 0.0642141717879837, 0.107013341555486, 
0.0892122245482354, 0.151976744172333, 0.198474636073771, 0.188703600586299, 
0.10970902239241, 0.117358989261514, 0.100312892958432, 0.118208485589655, 
0.154895187369863, 0.101035151359696, -3926.26508451201, -696.475731092535, 
-4384.77847338655, -718.718487256701, -3164.8941685203, -818.006663108841, 
-4658.25223372398, -826.496302684798, -2416.89272653148, -2558.96929067338, 
-672.257745869921, -3996.85447223941, -706.5215296652, -4190.52281192937, 
-726.870892539311, -2852.22943401345, -831.88857277573, -4580.4780146496, 
-816.712564805672, -6189.10619924791, -2540.95473989213, -713.422629648631, 
7712.37025286162, 1906.38208801373, 7583.22998649368, 1879.68741296455, 
7424.47445663593, 2236.40541039894, 6178.69685860507, 2064.41869983299, 
5953.06397562968, 6373.90332689516, 1813.36551434687, 4585.08608292281, 
1524.44544360278, 4343.20955707026, 1547.28354007935, 4772.3012092321, 
1889.5819203618, 4251.33850498831, 1687.78145119834, 5978.28926211454, 
3879.07376129486, 1533.61842684178, -622.178041494169, -301.118488704851, 
-618.132026278872, -302.614140229218, -821.035687044046, -383.60893819189, 
-683.302506820162, -331.474546574133, -764.139865695781, -546.931098421476, 
-300.006976301825, -622.178041494169, -301.118488704851, -618.132026278872, 
-302.614140229218, -821.035687044046, -383.60893819189, -683.302506820162, 
-331.474546574133, -764.139865695781, -546.931098421476, -300.006976301825
)), .Names = c("Compound", "Concentration", "Co.Agonist", "variable", 
"value"), row.names = c(NA, -110L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000000000120788>)
> 

我的目标是按前面的各列(Compound,Concentration,Co.Agonist和variable)对value列进行汇总。

我试过这个:

# Attempt from the question, kept verbatim.
# NOTE(review): `data_melt$value` refers to the full, ungrouped column, so both
# summaries ignore the grouping set up by group_by(); sd() is also called
# without na.rm although `value` contains NA entries.
DatAgg = data_melt %>% group_by(Concentration,Co.Agonist,variable)%>%
  summarise(mean=mean(data_melt$value,na.rm=TRUE),sd=sd(data_melt$value))

但输出不会将data.frame折叠为唯一的组合,并且只返回所有变量的一个值。

我希望使用这个dplyr解决方案来获得group_by变量的每个唯一组合的mean和sd。

1 个答案:

答案 0 :(得分:3)

语法应为

 # Group by the identifying columns, then reduce every group to its mean and
 # standard deviation. The bare column name `value` is evaluated inside each
 # group, unlike `data_melt$value`, which always refers to the whole column.
 data_melt %>%
   group_by(Concentration, Co.Agonist, variable) %>%
   summarise(
     Mean = mean(value, na.rm = TRUE),
     Sd = sd(value, na.rm = TRUE)
   )

在OP的代码中,虽然已经按变量分组,但mean(data_melt$value, na.rm=TRUE)取的是整个“value”列的均值,而不是各组内的均值。因此,所有分组得到的都是同一个值。

由于'data_melt'是data.table对象,因此也可以使用data.table方法。

 # data.table form of the same aggregation: no row filter (empty `i`), the
 # summaries are computed in `j`, and `by` lists the grouping columns.
 data_melt[
   ,
   .(
     Mean = mean(value, na.rm = TRUE),
     Sd = sd(value, na.rm = TRUE)
   ),
   by = .(Concentration, Co.Agonist, variable)
 ]