如何对某一列中取值相同的行求平均值

时间:2017-12-22 11:32:37

标签: r

我有一个数据框,想对第一列(Symbols)中取值相同的行求平均值,并把平均值作为新的一行显示在这些行之后。

DF1

# Input data for the question (DF1), written as a structure() literal:
# 11 rows with a character Symbols column followed by five numeric columns.
# "AAGAB" appears in two rows and "AAK1" in five; every other symbol appears
# once. The class attribute marks it as a tibble ("tbl_df").
structure(list(Symbols = c("AAAS", "AACS", "AADAC", "AAGAB", 
"AAGAB", "AAK1", "AAK1", "AAK1", "AAK1", "AAK1", "AAMDC"), Average_Control = c(5.7212099528, 
9.6925693375, 3.1913650495, 7.9479411012, 10.5609967525, 8.2969969243, 
5.3382193495, 1.1836102209, 0.7941625658, 3.8002240701, 2.7307985646
), Glycyrrhizic_acid_rep_1 = c(5.290201, 9.735883, 3.3448757611, 
7.6838303132, 10.0561155597, 8.1006595504, 4.6894686662, 1.478692, 
2.382658, 4.474969, 3.396714), Hydroxysafflor_yellow_A = c(5.574157, 
9.806325, 0.7223951505, 7.8382521567, 10.4195623492, 8.1103481281, 
4.950000706, 1.192986, -0.09879839, 4.585423, 4.151861), Anhydroicaritin =   c(5.464502, 
9.781125, 4.3363375165, 7.8300766195, 10.4435803063, 8.2280380864, 
5.3344613357, 0.9328041, 1.675867, 2.748738, 3.945212), Hyperoside =c(6.126322, 
9.829496, 2.4293884258, 7.7151641411, 10.4487046678, 8.0864510043, 
4.9309392652, 0.5856222, 0.422873, 2.518488, 3.8994)), .Names =  c("Symbols", 
"Average_Control", "Glycyrrhizic_acid_rep_1", "Hydroxysafflor_yellow_A", 
"Anhydroicaritin", "Hyperoside"), row.names = c(NA, -11L), class =  c("tbl_df", 
"tbl", "data.frame"))

期望的输出 Final_Result 应为如下数据框:

# Expected result (Final_Result): the 11 input rows plus one
# "<Symbol>_Average" row placed directly after each symbol that occurs more
# than once. Each average row holds the per-column mean of that symbol's rows
# (row 6 averages the two "AAGAB" rows, row 12 the five "AAK1" rows).
# The label of row 6 is "AAGAB_Average" so that it follows the same
# "<Symbol>_Average" pattern as "AAK1_Average".
Final_Result <- structure(list(Symbols = c("AAAS", "AACS", "AADAC", "AAGAB",
"AAGAB", "AAGAB_Average", "AAK1", "AAK1", "AAK1", "AAK1", "AAK1",
"AAK1_Average", "AAMDC"), Average_Control = c(5.7212099528, 9.6925693375,
3.1913650495, 7.9479411012, 10.5609967525, 9.25446892685, 8.2969969243,
5.3382193495, 1.1836102209, 0.7941625658, 3.8002240701, 3.88264262612,
2.7307985646), Glycyrrhizic_acid_rep_1 = c(5.290201, 9.735883,
3.3448757611, 7.6838303132, 10.0561155597, 8.86997293645, 8.1006595504,
4.6894686662, 1.478692, 2.382658, 4.474969, 4.22528944332, 3.396714
), Hydroxysafflor_yellow_A = c(5.574157, 9.806325, 0.7223951505,
7.8382521567, 10.4195623492, 9.12890725295, 8.1103481281, 4.950000706,
1.192986, -0.09879839, 4.585423, 3.74799188882, 4.151861), Anhydroicaritin = c(5.464502,
9.781125, 4.3363375165, 7.8300766195, 10.4435803063, 9.1368284629,
8.2280380864, 5.3344613357, 0.9328041, 1.675867, 2.748738, 3.78398170442,
3.945212), Hyperoside = c(6.126322, 9.829496, 2.4293884258, 7.7151641411,
10.4487046678, 9.08193440445, 8.0864510043, 4.9309392652, 0.5856222,
0.422873, 2.518488, 3.3088746939, 3.8994)), .Names = c("Symbols",
"Average_Control", "Glycyrrhizic_acid_rep_1", "Hydroxysafflor_yellow_A",
"Anhydroicaritin", "Hyperoside"), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -13L))

2 个答案:

答案 0 :(得分:0)

# Example data: 11 rows keyed by a character Symbols column, followed by five
# numeric columns. "AAGAB" (2 rows) and "AAK1" (5 rows) are the repeated keys.
# The column names come straight from the list, so no separate names
# attribute is needed.
df1 <- structure(
  list(
    Symbols = c(
      "AAAS", "AACS", "AADAC", "AAGAB", "AAGAB", "AAK1",
      "AAK1", "AAK1", "AAK1", "AAK1", "AAMDC"
    ),
    Average_Control = c(
      5.7212099528, 9.6925693375, 3.1913650495, 7.9479411012,
      10.5609967525, 8.2969969243, 5.3382193495, 1.1836102209,
      0.7941625658, 3.8002240701, 2.7307985646
    ),
    Glycyrrhizic_acid_rep_1 = c(
      5.290201, 9.735883, 3.3448757611, 7.6838303132,
      10.0561155597, 8.1006595504, 4.6894686662, 1.478692,
      2.382658, 4.474969, 3.396714
    ),
    Hydroxysafflor_yellow_A = c(
      5.574157, 9.806325, 0.7223951505, 7.8382521567,
      10.4195623492, 8.1103481281, 4.950000706, 1.192986,
      -0.09879839, 4.585423, 4.151861
    ),
    Anhydroicaritin = c(
      5.464502, 9.781125, 4.3363375165, 7.8300766195,
      10.4435803063, 8.2280380864, 5.3344613357, 0.9328041,
      1.675867, 2.748738, 3.945212
    ),
    Hyperoside = c(
      6.126322, 9.829496, 2.4293884258, 7.7151641411,
      10.4487046678, 8.0864510043, 4.9309392652, 0.5856222,
      0.422873, 2.518488, 3.8994
    )
  ),
  row.names = c(NA, -11L),
  class = c("tbl_df", "tbl", "data.frame")
)

library(dplyr)

# Build one "<Symbol>_Average" row for every symbol that occurs more than
# once, add the original rows back, and sort by Symbols. For this data the
# sort places each average row directly after the rows it summarises, which
# is the layout the question asks for.
df1 %>%
  group_by(Symbols) %>%                              # for each Symbols value
  filter(n() > 1) %>%                                # keep only symbols that occur in multiple rows
  summarise_all(mean) %>%                            # get the average of every other column
  mutate(Symbols = paste0(Symbols, "_Average")) %>%  # label the averaged rows
  bind_rows(df1) %>%                                 # add the original rows back
  arrange(Symbols)                                   # "X" sorts before "X_Average"

# # A tibble: 13 x 6
#         Symbols Average_Control Glycyrrhizic_acid_rep_1 Hydroxysafflor_yellow_A Anhydroicaritin Hyperoside
#           <chr>           <dbl>                   <dbl>                   <dbl>           <dbl>      <dbl>
# 1          AAAS       5.7212100                5.290201              5.57415700       5.4645020  6.1263220
# 2          AACS       9.6925693                9.735883              9.80632500       9.7811250  9.8294960
# 3         AADAC       3.1913650                3.344876              0.72239515       4.3363375  2.4293884
# 4         AAGAB       7.9479411                7.683830              7.83825216       7.8300766  7.7151641
# 5         AAGAB      10.5609968               10.056116             10.41956235      10.4435803 10.4487047
# 6 AAGAB_Average       9.2544689                8.869973              9.12890725       9.1368285  9.0819344
# 7          AAK1       8.2969969                8.100660              8.11034813       8.2280381  8.0864510
# 8          AAK1       5.3382193                4.689469              4.95000071       5.3344613  4.9309393
# 9          AAK1       1.1836102                1.478692              1.19298600       0.9328041  0.5856222
# 10         AAK1       0.7941626                2.382658             -0.09879839       1.6758670  0.4228730
# 11         AAK1       3.8002241                4.474969              4.58542300       2.7487380  2.5184880
# 12 AAK1_Average       3.8826426                4.225289              3.74799189       3.7839817  3.3088747
# 13        AAMDC       2.7307986                3.396714              4.15186100       3.9452120  3.8994000

答案 1 :(得分:0)

没有dplyr,你可以这样做:

# Example data: 11 rows keyed by a character Symbols column, followed by five
# numeric columns. "AAGAB" (2 rows) and "AAK1" (5 rows) are the repeated keys.
df1 <- structure(
  list(
    Symbols = c(
      "AAAS", "AACS", "AADAC", "AAGAB", "AAGAB", "AAK1",
      "AAK1", "AAK1", "AAK1", "AAK1", "AAMDC"
    ),
    Average_Control = c(
      5.7212099528, 9.6925693375, 3.1913650495, 7.9479411012,
      10.5609967525, 8.2969969243, 5.3382193495, 1.1836102209,
      0.7941625658, 3.8002240701, 2.7307985646
    ),
    Glycyrrhizic_acid_rep_1 = c(
      5.290201, 9.735883, 3.3448757611, 7.6838303132,
      10.0561155597, 8.1006595504, 4.6894686662, 1.478692,
      2.382658, 4.474969, 3.396714
    ),
    Hydroxysafflor_yellow_A = c(
      5.574157, 9.806325, 0.7223951505, 7.8382521567,
      10.4195623492, 8.1103481281, 4.950000706, 1.192986,
      -0.09879839, 4.585423, 4.151861
    ),
    Anhydroicaritin = c(
      5.464502, 9.781125, 4.3363375165, 7.8300766195,
      10.4435803063, 8.2280380864, 5.3344613357, 0.9328041,
      1.675867, 2.748738, 3.945212
    ),
    Hyperoside = c(
      6.126322, 9.829496, 2.4293884258, 7.7151641411,
      10.4487046678, 8.0864510043, 4.9309392652, 0.5856222,
      0.422873, 2.518488, 3.8994
    )
  ),
  row.names = c(NA, -11L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Select the rows whose symbol occurs more than once and average every other
# column within each symbol. The data frame defined above is `df1`, so the
# selection and the aggregation both read from it.
is_repeated <- df1$Symbols %in% df1$Symbols[duplicated(df1$Symbols)]
agg <- aggregate(. ~ Symbols, data = df1[is_repeated, ], FUN = mean)

# paste0() joins without a separator, giving "AAGAB_Average"; paste() would
# insert its default " " separator and produce "AAGAB _Average".
agg$Symbols <- paste0(agg$Symbols, "_Average")

a3 <- rbind(agg, df1)

# Change order and row index to make the result equal to the requested output.
# Sorting by Symbols puts each "<Symbol>_Average" row right after its group
# for this data; resetting row.names renumbers the rows 1..nrow(a3).
a3 <- a3[order(a3$Symbols), ]
row.names(a3) <- NULL

a3
#          Symbols Average_Control Glycyrrhizic_acid_rep_1 Hydroxysafflor_yellow_A Anhydroicaritin Hyperoside
# 1           AAAS       5.7212100                5.290201              5.57415700       5.4645020  6.1263220
# 2           AACS       9.6925693                9.735883              9.80632500       9.7811250  9.8294960
# 3          AADAC       3.1913650                3.344876              0.72239515       4.3363375  2.4293884
# 4          AAGAB       7.9479411                7.683830              7.83825216       7.8300766  7.7151641
# 5          AAGAB      10.5609968               10.056116             10.41956235      10.4435803 10.4487047
# 6  AAGAB_Average       9.2544689                8.869973              9.12890725       9.1368285  9.0819344
# 7           AAK1       8.2969969                8.100660              8.11034813       8.2280381  8.0864510
# 8           AAK1       5.3382193                4.689469              4.95000071       5.3344613  4.9309393
# 9           AAK1       1.1836102                1.478692              1.19298600       0.9328041  0.5856222
# 10          AAK1       0.7941626                2.382658             -0.09879839       1.6758670  0.4228730
# 11          AAK1       3.8002241                4.474969              4.58542300       2.7487380  2.5184880
# 12  AAK1_Average       3.8826426                4.225289              3.74799189       3.7839817  3.3088747
# 13         AAMDC       2.7307986                3.396714              4.15186100       3.9452120  3.8994000