从重复测量中汇总数据的有效方法

时间:2018-11-10 03:17:07

标签: r bigdata aggregate

我正在分析来自大型实验(12400个单细胞和23800个基因)的基因表达数据,并且遇到了效率问题。我将在下面编写一个可重现的示例,但是我的问题如下:

我将数据集中的小鼠基因转换为对应的人类基因,以便能够与以前发布的其他数据进行比较。在某些情况下,存在多个匹配项(一个人类基因被映射到多个小鼠基因上)。在这些情况下,我想对这些多个基因的表达值取平均值,从而为对应的人类基因得出一个表达值。我可以通过将表达数据转换为矩阵格式(允许重复的行名)并应用aggregate()函数来实现此目的,但是处理大型数据集需要花费大量时间。在这里很难完整再现实际情况,但是我的模拟分析流程如下:

# Mock gene expression matrix:
# each column holds the expression values of one cell, each row is one gene
data <- cbind(
  cell1 = c(1, 1, 1, 1, 3, 3),
  cell2 = c(1, 2, 4, 10, 5, 10),
  cell3 = c(0, 0, 0, 1, 10, 20),
  cell4 = c(1, 3, 4, 4, 20, 20)
)

# Gene names become the row names; repeated names (ABC2, ABC5) are the
# duplicate measurements that will later be averaged
dimnames(data) <- list(
  c("ABC1", "ABC2", "ABC2", "ABC4", "ABC5", "ABC5"),
  colnames(data)
)

# Show the matrix
data
#>      cell1 cell2 cell3 cell4
#> ABC1     1     1     0     1
#> ABC2     1     2     0     3
#> ABC2     1     4     0     4
#> ABC4     1    10     1     4
#> ABC5     3     5    10    20
#> ABC5     3    10    20    20



# Collapse duplicate genes: rows sharing a gene name are replaced by a single
# row holding the per-cell mean (the unnamed grouping list yields "Group.1")
aggr_data <- aggregate(
  x = as.data.frame(data),
  by = list(rownames(data)),
  FUN = mean
)

# Desired end result: one averaged row per gene
aggr_data
#>   Group.1 cell1 cell2 cell3 cell4
#> 1    ABC1     1   1.0     0   1.0
#> 2    ABC2     1   3.0     0   3.5
#> 3    ABC4     1  10.0     1   4.0
#> 4    ABC5     3   7.5    15  20.0

是否有更高效的方法?

感谢您的回答!

2 个答案:

答案 0 :(得分:2)

您可以尝试 dplyr:summarise_all() 配合 mean() 使用,可以计算每个分组中每一列的平均值。

library(tidyverse) # including dplyr
# Same mock data as a tibble, with the gene name kept in an ordinary column.
# tibble() replaces the deprecated data_frame(); the outer parentheses print
# the object right after the assignment.
(df <-
  tibble(
    cell1 = c(1, 1, 1, 1, 3, 3),
    cell2 = c(1, 2, 4, 10, 5, 10),
    cell3 = c(0, 0, 0, 1, 10, 20),
    cell4 = c(1, 3, 4, 4, 20, 20),
    gene_name = c("ABC1", "ABC2", "ABC2", "ABC4", "ABC5", "ABC5")
  ))
#> # A tibble: 6 x 5
#>   cell1 cell2 cell3 cell4 gene_name
#>   <dbl> <dbl> <dbl> <dbl> <chr>    
#> 1     1     1     0     1 ABC1     
#> 2     1     2     0     3 ABC2     
#> 3     1     4     0     4 ABC2     
#> 4     1    10     1     4 ABC4     
#> 5     3     5    10    20 ABC5     
#> 6     3    10    20    20 ABC5

我只是将基因名称添加为另一列。现在,您可以使用group_by()进行分组操作:

df %>%
  group_by(gene_name) %>% # one group per gene
  # across() is the current replacement for the superseded summarise_all();
  # everything() selects all non-grouping columns
  summarise(across(everything(), mean))
#> # A tibble: 4 x 5
#>   gene_name cell1 cell2 cell3 cell4
#>   <chr>     <dbl> <dbl> <dbl> <dbl>
#> 1 ABC1          1   1       0   1  
#> 2 ABC2          1   3       0   3.5
#> 3 ABC4          1  10       1   4  
#> 4 ABC5          3   7.5    15  20

通常,对于像您这种情况的大型数据集,data.table包会更合适,代码如下:

# data.table is not attached by library(tidyverse), so load it explicitly
library(data.table)

# setDT() turns df into a data.table by reference (df itself is modified);
# .SD holds all non-grouping columns, so each one is averaged per gene
setDT(df)[, lapply(.SD, mean), by = gene_name]
#>    gene_name cell1 cell2 cell3 cell4
#> 1:      ABC1     1   1.0     0   1.0
#> 2:      ABC2     1   3.0     0   3.5
#> 3:      ABC4     1  10.0     1   4.0
#> 4:      ABC5     3   7.5    15  20.0

setDT()仅用于将df转换为data.table对象。

dplyr与data.table

如果通过反复绑定(拼接)示例数据集来构造一个更大的数据集,

# Print the benchmark tibble (the code that builds df_bench is not shown here)
df_bench
#># A tibble: 18,000 x 10,001
#>   gene_name cell1 cell2 cell3 cell4 cell5 cell6 cell7
#>   <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 ABC308        1     1     0     1     1     1     0
#> 2 ABC258        1     2     0     3     1     2     0
#> 3 ABC553        1     4     0     4     1     4     0
#> 4 ABC57         1    10     1     4     1    10     1
#> 5 ABC469        3     5    10    20     3     5    10
#> 6 ABC484        3    10    20    20     3    10    20
#> 7 ABC813        1     1     0     1     1     1     0
#> 8 ABC371        1     2     0     3     1     2     0
#> 9 ABC547        1     4     0     4     1     4     0
#>10 ABC171        1    10     1     4     1    10     1
#># ... with 17,990 more rows, and 9,993 more variables:
#>#   cell8 <dbl>, cell9 <dbl>, cell10 <dbl>,
#>#   cell11 <dbl>, cell12 <dbl>, cell13 <dbl>,
#>#   cell14 <dbl>, cell15 <dbl>, cell16 <dbl>,
#>#   cell17 <dbl>, cell18 <dbl>, cell19 <dbl>,
#>#   cell20 <dbl>, cell21 <dbl>, cell22 <dbl>,
#>#   cell23 <dbl>, cell24 <dbl>, cell25 <dbl>,
#>#   cell26 <dbl>, cell27 <dbl>, cell28 <dbl>,
#>#   cell29 <dbl>, cell30 <dbl>, cell31 <dbl>,
#>#   cell32 <dbl>, cell33 <dbl>, cell34 <dbl>,
#>#   cell35 <dbl>, cell36 <dbl>, cell37 <dbl>,
#>#   cell38 <dbl>, cell39 <dbl>, cell40 <dbl>,
#>#   cell41 <dbl>, cell42 <dbl>, cell43 <dbl>,
#>#   cell44 <dbl>, cell45 <dbl>, cell46 <dbl>,
#>#   cell47 <dbl>, cell48 <dbl>, cell49 <dbl>,
#>#   cell50 <dbl>, cell51 <dbl>, cell52 <dbl>,
#>#   cell53 <dbl>, cell54 <dbl>, cell55 <dbl>,
#>#   cell56 <dbl>, cell57 <dbl>, cell58 <dbl>,
#>#   cell59 <dbl>, cell60 <dbl>, cell61 <dbl>,
#>#   cell62 <dbl>, cell63 <dbl>, cell64 <dbl>,
#>#   cell65 <dbl>, cell66 <dbl>, cell67 <dbl>,
#>#   cell68 <dbl>, cell69 <dbl>, cell70 <dbl>,
#>#   cell71 <dbl>, cell72 <dbl>, cell73 <dbl>,
#>#   cell74 <dbl>, cell75 <dbl>, cell76 <dbl>,
#>#   cell77 <dbl>, cell78 <dbl>, cell79 <dbl>,
#>#   cell80 <dbl>, cell81 <dbl>, cell82 <dbl>,
#>#   cell83 <dbl>, cell84 <dbl>, cell85 <dbl>,
#>#   cell86 <dbl>, cell87 <dbl>, cell88 <dbl>,
#>#   cell89 <dbl>, cell90 <dbl>, cell91 <dbl>,
#>#   cell92 <dbl>, cell93 <dbl>, cell94 <dbl>,
#>#   cell95 <dbl>, cell96 <dbl>, cell97 <dbl>,
#>#   cell98 <dbl>, cell99 <dbl>, cell100 <dbl>,
#>#   cell101 <dbl>, cell102 <dbl>, cell103 <dbl>,
#>#   cell104 <dbl>, cell105 <dbl>, cell106 <dbl>,
#>#   cell107 <dbl>, …

使用此设置

# setDT() converts its argument by reference. Calling it inside the timed
# expression would turn df_bench into a data.table after the first DATATABLE
# run, so later DPLYR runs would no longer be timed on the tibble.
# Convert a separate copy once, outside the benchmark, instead.
dt_bench <- data.table::as.data.table(df_bench)

# Compare per-gene column means: dplyr on the tibble vs data.table on the copy
microbenchmark::microbenchmark(
  DPLYR = {
    df_bench %>%
      group_by(gene_name) %>%
      summarise_all(mean)
  },
  DATATABLE = {
    dt_bench[, lapply(.SD, mean), by = gene_name]
  },
  times = 50
)
#> Unit: seconds
#>       expr      min       lq     mean   median       uq      max neval
#>      DPLYR 32.82307 34.89050 38.10948 37.44543 40.01937 47.67549    50
#>  DATATABLE 12.16752 13.59018 16.09665 14.25976 15.60752 40.30257    50

data.table似乎比这里的dplyr快。

答案 1 :(得分:1)

使用data.table应该可以很好地工作:

library(data.table)
# Convert the matrix to a data.table and average every cell column within
# each gene; the grouping vector is taken from the matrix row names
as.data.table(data)[,
  lapply(.SD, mean),
  by = list(rownames(data))
]
#   rownames cell1 cell2 cell3 cell4
#1:     ABC1     1   1.0     0   1.0
#2:     ABC2     1   3.0     0   3.5
#3:     ABC4     1  10.0     1   4.0
#4:     ABC5     3   7.5    15  20.0

在SO上快速搜索一下,就能找到一个比较各种按组操作速度的链接(对于大型数据,data.table是最快的方法):

Calculate the mean by group