每个子集

时间:2017-02-23 15:28:46

标签: r

以下是可用于重现该数据框的最小示例。

# Minimal reproducible example: a 10-row data.frame with the columns
# Gene, depth, bases_covered, gene_length and regioncoverage.
# Gene is a factor that carries a long list of levels, but only three of
# them are used by these rows (codes 147, 148 and 87, i.e. "k141_908_2",
# "k141_908_5" and "k141_5079_9").
# NOTE(review): the non-consecutive row.names (1, 2, 33, ...) suggest these
# rows were subset from a larger data frame -- confirm against the source data.
df <- structure(list(Gene = structure(c(147L, 147L, 148L, 148L, 148L, 
87L, 87L, 87L, 87L, 87L), .Label = c("genome", "k141_1189_101", 
"k141_1189_104", "k141_1189_105", "k141_1189_116", "k141_1189_13", 
"k141_1189_14", "k141_1189_146", "k141_1189_150", "k141_1189_18", 
"k141_1189_190", "k141_1189_194", "k141_1189_215", "k141_1189_248", 
"k141_1189_251", "k141_1189_252", "k141_1189_259", "k141_1189_274", 
"k141_1189_283", "k141_1189_308", "k141_1189_314", "k141_1189_322", 
"k141_1189_353", "k141_1189_356", "k141_1189_372", "k141_1189_373", 
"k141_1189_43", "k141_1189_45", "k141_1189_72", "k141_1597_15", 
"k141_1597_18", "k141_1597_23", "k141_1597_41", "k141_1597_55", 
"k141_1597_66", "k141_1597_67", "k141_1597_68", "k141_1597_69", 
"k141_2409_34", "k141_2409_8", "k141_3390_69", "k141_3390_83", 
"k141_3390_84", "k141_3726_25", "k141_3726_31", "k141_3726_49", 
"k141_3726_50", "k141_3726_62", "k141_3726_8", "k141_3726_80", 
"k141_3790_1", "k141_3993_114", "k141_3993_122", "k141_3993_162", 
"k141_3993_172", "k141_3993_183", "k141_3993_186", "k141_3993_188", 
"k141_3993_24", "k141_3993_25", "k141_3993_28", "k141_3993_32", 
"k141_3993_44", "k141_3993_47", "k141_3993_53", "k141_3993_57", 
"k141_3993_68", "k141_4255_80", "k141_4255_81", "k141_4255_87", 
"k141_5079_107", "k141_5079_110", "k141_5079_130", "k141_5079_14", 
"k141_5079_141", "k141_5079_16", "k141_5079_184", "k141_5079_185", 
"k141_5079_202", "k141_5079_24", "k141_5079_39", "k141_5079_63", 
"k141_5079_65", "k141_5079_70", "k141_5079_77", "k141_5079_87", 
"k141_5079_9", "k141_5313_16", "k141_5313_17", "k141_5313_20", 
"k141_5313_23", "k141_5313_39", "k141_5313_5", "k141_5313_51", 
"k141_5313_52", "k141_5313_78", "k141_5545_101", "k141_5545_103", 
"k141_5545_104", "k141_5545_105", "k141_5545_106", "k141_5545_107", 
"k141_5545_108", "k141_5545_109", "k141_5545_110", "k141_5545_111", 
"k141_5545_112", "k141_5545_113", "k141_5545_114", "k141_5545_119", 
"k141_5545_128", "k141_5545_130", "k141_5545_139", "k141_5545_141", 
"k141_5545_145", "k141_5545_16", "k141_5545_169", "k141_5545_17", 
"k141_5545_172", "k141_5545_6", "k141_5545_60", "k141_5545_62", 
"k141_5545_63", "k141_5545_86", "k141_5545_87", "k141_5545_88", 
"k141_5545_89", "k141_5545_91", "k141_5545_92", "k141_5545_93", 
"k141_5545_94", "k141_5545_96", "k141_5545_97", "k141_5545_98", 
"k141_5545_99", "k141_5734_13", "k141_5734_2", "k141_5734_4", 
"k141_5734_5", "k141_5734_6", "k141_6014_124", "k141_6014_2", 
"k141_6014_34", "k141_6014_75", "k141_6014_96", "k141_908_14", 
"k141_908_2", "k141_908_5", "k141_957_126", "k141_957_135", "k141_957_136", 
"k141_957_14", "k141_957_140", "k141_957_141", "k141_957_148", 
"k141_957_179", "k141_957_191", "k141_957_35", "k141_957_47", 
"k141_957_55", "k141_957_57", "k141_957_59", "k141_957_6", "k141_957_63", 
"k141_957_65", "k141_957_68", "k141_957_77", "k141_957_95"), class = "factor"), 
    depth = c(9L, 10L, 9L, 10L, 11L, 14L, 15L, 16L, 17L, 18L), 
    bases_covered = c(6L, 3L, 4L, 7L, 4L, 59L, 54L, 70L, 34L, 
    17L), gene_length = c(1140L, 1140L, 591L, 591L, 591L, 690L, 
    690L, 690L, 690L, 690L), regioncoverage = c(54L, 30L, 36L, 
    70L, 44L, 826L, 810L, 1120L, 578L, 306L)), .Names = c("Gene", 
"depth", "bases_covered", "gene_length", "regioncoverage"), row.names = c(1L, 
2L, 33L, 34L, 35L, 78L, 79L, 80L, 81L, 82L), class = "data.frame")

数据框如下所示:

      Gene depth bases_covered gene_length regioncoverage
1   k141_908_2     9             6        1140             54
2   k141_908_2    10             3        1140             30
33  k141_908_5     9             4         591             36
34  k141_908_5    10             7         591             70
35  k141_908_5    11             4         591             44
78 k141_5079_9    14            59         690            826
79 k141_5079_9    15            54         690            810
80 k141_5079_9    16            70         690           1120
81 k141_5079_9    17            34         690            578
82 k141_5079_9    18            17         690            306

我想要的是:对于每个基因(例如 k141_908_2),对 regioncoverage 求和,然后除以 unique(gene_length)。实际上,同一个基因的 gene_length 始终相同。

例如,对于基因 k141_908_2,计算方式为:(54 + 30) / 1140 = 0.07;对于基因 k141_908_5,计算方式为:(36 + 70 + 44) / 591 = 0.25。

最终的数据框应报告两列。

     Gene Newcoverage
1   k141_908_2 0.07
2   k141_908_5 0.25
3   ......

等等。

感谢您的帮助

5 个答案:

答案 0 :(得分:2)

使用 dplyr 这很简单:

library(dplyr)
df %>%
  group_by(Gene) %>%
  summarise(Newcoverage = sum(regioncoverage) / gene_length[1])

答案 1 :(得分:1)

我需要先将第一列转换为 character,并将其余各列转换为 numeric。之后就可以按基因拆分 df,再进行所需的计算。

# Coerce the four measurement columns to numeric and the gene ID to
# character, so that split() groups by plain strings instead of by the
# factor's full level set.
num_cols <- c("depth", "bases_covered", "gene_length", "regioncoverage")
df[num_cols] <- lapply(df[num_cols], as.numeric)
df$Gene <- as.character(df$Gene)

# For each gene: summed region coverage divided by the gene length, read
# from the group's first row. vapply() guarantees a named numeric vector
# (sapply() would return a list for empty input).
vapply(
  split(df, df$Gene),
  function(g) sum(g$regioncoverage) / g$gene_length[1],
  numeric(1)
)
#k141_5079_9  k141_908_2  k141_908_5 
# 5.27536232  0.07368421  0.25380711 

答案 2 :(得分:1)

我们可以使用tidyverse

library(tidyverse)

# Collapse to one row per gene: total regioncoverage over the gene's
# length, taken from the first row of each group.
df %>%
  group_by(Gene) %>%
  summarise(Newcoverage = sum(regioncoverage) / first(gene_length))
# A tibble: 3 × 2
#          Gene Newcoverage
#        <fctr>       <dbl>
#1 k141_5079_9  5.27536232
#2  k141_908_2  0.07368421
#3  k141_908_5  0.25380711

base R选项

# Base R: hand by() only the two columns it needs, grouped by the gene ID
# as a character vector; each group yields the summed region coverage
# divided by the gene length found in its first row.
by(
  df[c("gene_length", "regioncoverage")],
  list(as.character(df[["Gene"]])),
  FUN = function(grp) sum(grp[["regioncoverage"]]) / grp[["gene_length"]][1]
)

答案 3 :(得分:0)

快速方法

library(data.table)

# Work on a data.table copy: setDT() would convert `df` itself by
# reference, changing the object that any other code using `df` sees.
DT <- as.data.table(df)

# One row per gene: summed region coverage divided by the gene length.
# The length is constant within a gene, so its first value is used.
DT[, .(New_Coverage = sum(regioncoverage) / gene_length[1L]), by = .(Gene)]

输出

         Gene New_Coverage
1:  k141_908_2   0.07368421
2:  k141_908_5   0.25380711
3: k141_5079_9   5.27536232

答案 4 :(得分:0)

我经常使用dplyr。所以这是一种方式:

library(dplyr)

# Attach the per-gene coverage to every row: summed regioncoverage divided
# by the gene's length. ungroup() removes the grouping afterwards so that
# later verbs applied to the result do not silently operate per gene.
df %>%
  group_by(Gene) %>%
  mutate(Newcoverage = sum(regioncoverage) / unique(gene_length)) %>%
  ungroup()

如果您只想要每个基因的唯一值:

# One row per gene: summarise() collapses each group directly, which
# replaces the transmute() + unique() round trip and returns an ungrouped
# result. Rows come back ordered by Gene rather than in input order.
df %>%
  group_by(Gene) %>%
  summarise(Newcoverage = sum(regioncoverage) / first(gene_length))