使用dplyr,针对另外两个变量的每个唯一组合,仅对分组数据框中的变量求和一次

时间:2019-04-20 00:01:43

标签: r group-by dplyr tidyverse mutate

我有一张长表,其中包含 `area` 和 `cluster` 的重复组合。

# Example data in long format: one row per age / area / cluster observation.
# The same area/cluster pair occurs on several rows (once per age), and in this
# sample every row of a given pair carries the same norm.to.area value.
counts <-  tibble::tribble(
         ~age,         ~area,          ~cluster, ~norm.to.area,
      "gw_25",   "cingulate",       "cluster_1",          0.03,
      "gw_20",   "cingulate",       "cluster_1",          0.03,
      "gw_18", "hippocampus",       "cluster_1",          0.02,
      "gw_25",      "insula",       "cluster_1",          0.01,
      "gw_20",       "motor",       "cluster_1",          0.01,
      "gw_22",       "motor",       "cluster_1",          0.01,
      "gw_25",       "motor",       "cluster_1",          0.01,
      "gw_14",       "motor",       "cluster_1",          0.01,
      "gw_18",       "motor",       "cluster_1",          0.01,
      "gw_19",       "motor",       "cluster_1",          0.01,
      "gw_17",       "motor",       "cluster_1",          0.01,
      "gw_20",   "occipital",       "cluster_1",          0.01,
      "gw_17",   "occipital",       "cluster_1",          0.01,
      "gw_18",   "occipital",       "cluster_1",          0.01,
      "gw_19",   "occipital",       "cluster_1",          0.01,
      "gw_22",   "occipital",       "cluster_1",          0.01,
      "gw_14",   "occipital",       "cluster_1",          0.01,
      "gw_22",    "parietal",       "cluster_1",             0,
      "gw_25",    "parietal",       "cluster_1",             0,
      "gw_17",    "parietal",       "cluster_1",             0,
      "gw_19",    "parietal",       "cluster_1",             0,
      "gw_20",    "parietal",       "cluster_1",             0,
      "gw_20",         "PFC",       "cluster_1",          0.01,
      "gw_22",         "PFC",       "cluster_1",          0.01,
      "gw_25",         "PFC",       "cluster_1",          0.01
      )

我想创建一个新变量 `sum.norm.to.area`,它是每个 `cluster` 内 `norm.to.area` 的总和,其中每个 `area`/`cluster` 组合的 `norm.to.area` 值只计入一次。

我尝试过按 `area` / `subcluster.merge`(即示例数据中的 `cluster`)进行 `group_by`,但这样会按给定组合出现的次数重复累加这些值。

cluster

感谢您的建议。

更新1:

尝试使用 `summarize`,如下所示,但是会发生同样的事情(当然,只是结果不会作为新列添加):

# Attempt with mutate(): sums norm.to.area over every row of the cluster, so an
# area that appears on several rows is counted several times.
counts %>% group_by(cluster) %>% mutate(sum.norm.to.area = sum(norm.to.area))

# Attempt with summarize(): collapses to one row per cluster/area pair, but
# still sums every row of the pair, so repeated rows are counted more than once.
# The sample data names the grouping column `cluster`.
counts %>% group_by(cluster, area) %>% summarize(sum(norm.to.area))

更新2

这是我想要的输出,但是我得到它的方式太复杂了。我想找到一种更简单的、使用 `mutate` 的方式,而不必使用 `join`。

tmp <- counts %>%
  distinct(area, cluster, .keep_all = TRUE) %>%
  add_count(cluster, wt = norm.to.area, name = "sum.norm.to.area")

counts %>% left_join(tmp, by = c("cluster", "area"))

所需的输出: `sum.norm.to.area` 是对 `area` 和 `cluster` 的所有唯一组合的 `norm.to.area`(每个组合仅一次)相加的结果:

tibble::tribble(
         ~cluster,           ~area, ~sum.norm.to.area,
      "cluster_1",           "PFC",              0.06,
      "cluster_1", "somatosensory",              0.05,
      "cluster_1",         "motor",              0.07,
      "cluster_1",      "parietal",                 0,
      "cluster_1",      "temporal",              0.03,
      "cluster_1",     "occipital",              0.06,
      "cluster_1",   "hippocampus",              0.02,
      "cluster_1",        "insula",              0.01,
      "cluster_1",     "cingulate",              0.06,
  "cluster_10-34",           "PFC",              0.42,
  "cluster_10-34", "somatosensory",              0.35,
  "cluster_10-34",         "motor",              0.48,
  "cluster_10-34",      "parietal",              0.36,
  "cluster_10-34",      "temporal",              0.28,
  "cluster_10-34",     "occipital",               0.4,
  "cluster_10-34",   "hippocampus",              0.12,
  "cluster_10-34",        "insula",                 0,
  "cluster_10-34",     "cingulate",                 0,
     "cluster_11",           "PFC",              0.18,
     "cluster_11", "somatosensory",              0.15,
     "cluster_11",         "motor",              0.14,
     "cluster_11",      "parietal",              0.12,
     "cluster_11",      "temporal",              0.04,
     "cluster_11",     "occipital",              0.18,
     "cluster_11",   "hippocampus",              0.02
  )

2 个答案:

答案 0 :(得分:1)

使用dplyr,我们可以按 `cluster` 分组(`group_by`),并在求和(`sum`)时对每个 `area` 只取一个值。

library(dplyr)

# Add a per-cluster total that counts each area only once.
# `!duplicated(area)` keeps the first row of every area within the cluster, so
# the norm.to.area value of that first row is the one that enters the sum.
counts %>%
  group_by(cluster) %>%
  mutate(sum.norm = sum(norm.to.area[!duplicated(area)])) %>%
  # Drop the grouping so later verbs do not silently operate per cluster.
  ungroup()


#   age   area        cluster   norm.to.area sum.norm
#   <chr> <chr>       <chr>            <dbl>    <dbl>
# 1 gw_25 cingulate   cluster_1         0.03     0.09
# 2 gw_20 cingulate   cluster_1         0.03     0.09
# 3 gw_18 hippocampus cluster_1         0.02     0.09
# 4 gw_25 insula      cluster_1         0.01     0.09
# 5 gw_20 motor       cluster_1         0.01     0.09
# 6 gw_22 motor       cluster_1         0.01     0.09
# 7 gw_25 motor       cluster_1         0.01     0.09
# 8 gw_14 motor       cluster_1         0.01     0.09
# 9 gw_18 motor       cluster_1         0.01     0.09
#10 gw_19 motor       cluster_1         0.01     0.09
# … with 15 more rows

答案 1 :(得分:0)

我想您要找的不是 mutate(),而是 summarize():

# Collapse to one row per cluster/area pair, summing norm.to.area over all rows
# of that pair.
counts %>%
  group_by(cluster, area) %>%
  summarise(sum.norm.to.area = sum(norm.to.area))