R - 多个数据帧中的分类变量的频率

时间:2017-10-11 16:29:12

标签: r count frequency

我有18个数据帧,它们看起来像这样:

我需要遍历第7列,并统计其中每个值在其余17个数据帧中出现的频率。我尝试过不同的方法,比如 plyr 包(即提供 ddply 的那个包)中的 count 函数,但我不确定它们是否适用于多个数据帧。

我想要的输出是每个基因在所有数据帧中出现的频率(每个基因在单个数据帧中最多出现一次):

dput的输出:

> dput(head(df1, 20))
structure(list(V1 = 1:20, V2 = structure(c(16L, 14L, 13L, 32L, 
18L, 28L, 1L, 2L, 31L, 25L, 15L, 5L, 23L, 24L, 35L, 7L, 9L, 20L, 
29L, 10L), .Label = c("BMPR1B", "C9", "CACNB2", "CLEC12A", "DEFB126", 
"DYNC2H1", "EDDM3A", "FAM47C", "FZD8", "GPR1", "GSDMC", "H2AFY2", 
"HOMER1", "HSPB3", "HTR7", "IFNA1", "IFNA6", "IL1RL1", "INSL5", 
"LIMCH1", "LINC00478", "LRCH2", "MAGEH1", "MGP", "MMP8", "NFIB", 
"PDE1A", "PLSCR2", "PPP1R42", "PRKY", "PTENP1", "RGAG1", "SSBP2", 
"STEAP4", "SV2C", "TLR2", "TMSB15A", "TOX"), class = "factor"), 
    V3 = structure(c(16L, 14L, 13L, 32L, 18L, 28L, 1L, 2L, 31L, 
    25L, 15L, 5L, 23L, 24L, 35L, 7L, 9L, 20L, 29L, 10L), .Label = c("BMPR1B", 
    "C9", "CACNB2", "CLEC12A", "DEFB126", "DYNC2H1", "EDDM3A", 
    "FAM47C", "FZD8", "GPR1", "GSDMC", "H2AFY2", "HOMER1", "HSPB3", 
    "HTR7", "IFNA1", "IFNA6", "IL1RL1", "INSL5", "LIMCH1", "LINC00478", 
    "LRCH2", "MAGEH1", "MGP", "MMP8", "NFIB", "PDE1A", "PLSCR2", 
    "PPP1R42", "PRKY", "PTENP1", "RGAG1", "SSBP2", "STEAP4", 
    "SV2C", "TLR2", "TMSB15A", "TOX"), class = "factor"), V4 = structure(c(16L, 
    14L, 13L, 32L, 18L, 28L, 1L, 2L, 31L, 25L, 15L, 5L, 23L, 
    24L, 35L, 7L, 9L, 20L, 29L, 10L), .Label = c("BMPR1B", "C9", 
    "CACNB2", "CLEC12A", "DEFB126", "DYNC2H1", "EDDM3A", "FAM47C", 
    "FZD8", "GPR1", "GSDMC", "H2AFY2", "HOMER1", "HSPB3", "HTR7", 
    "IFNA1", "IFNA6", "IL1RL1", "INSL5", "LIMCH1", "LINC00478", 
    "LRCH2", "MAGEH1", "MGP", "MMP8", "NFIB", "PDE1A", "PLSCR2", 
    "PPP1R42", "PRKY", "PTENP1", "RGAG1", "SSBP2", "STEAP4", 
    "SV2C", "TLR2", "TMSB15A", "TOX"), class = "factor"), V5 = c(46L, 
    49L, 90L, 93L, 99L, 150L, 272L, 373L, 472L, 478L, 489L, 540L, 
    661L, 855L, 889L, 947L, 971L, 1002L, 1007L, 1080L), V6 = c(0.732, 
    0.717, 0.635, 0.633, 0.624, 0.545, 0.449, 0.374, 0.3, 0.295, 
    0.284, 0.244, 0.16, 0.0366, 0.0121, -0.0197, -0.0334, -0.0546, 
    -0.056, -0.0982), V7 = c(0.0345, 0.0903, 0.12, 0.169, 0.216, 
    0.233, 0.205, 0.183, 0.155, 0.176, 0.193, 0.186, 0.136, 0.0379, 
    0.0215, -0.00676, -0.0162, -0.0275, -0.0252, -0.0551), V8 = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), .Label = c("NO", "YES"), class = "factor")), .Names = c("V1", 
"V2", "V3", "V4", "V5", "V6", "V7", "V8"), row.names = c(NA, 
20L), class = "data.frame")
> dput(head(df2, 20))
structure(list(V1 = 1:20, V2 = structure(c(5L, 16L, 9L, 11L, 
12L, 13L, 6L, 10L, 20L, 14L, 3L, 8L, 17L, 4L, 1L, 21L, 15L, 18L, 
7L, 22L), .Label = c("ACAT2", "ADRA2A", "AKR1B1", "BRCA2", "CCNE2", 
"CDC6", "CDO1", "CYP2C9", "EZH2", "GSG2", "HAUS8", "HIST2H2BF", 
"HMGN2", "KIF18A", "MEIS2", "ORC6", "OTC", "PECR", "REG1A", "SKA1", 
"SYCP3", "TOX"), class = "factor"), V3 = structure(c(5L, 16L, 
9L, 11L, 12L, 13L, 6L, 10L, 20L, 14L, 3L, 8L, 17L, 4L, 1L, 21L, 
15L, 18L, 7L, 22L), .Label = c("ACAT2", "ADRA2A", "AKR1B1", "BRCA2", 
"CCNE2", "CDC6", "CDO1", "CYP2C9", "EZH2", "GSG2", "HAUS8", "HIST2H2BF", 
"HMGN2", "KIF18A", "MEIS2", "ORC6", "OTC", "PECR", "REG1A", "SKA1", 
"SYCP3", "TOX"), class = "factor"), V4 = structure(c(5L, 16L, 
9L, 11L, 12L, 13L, 6L, 10L, 20L, 14L, 3L, 8L, 17L, 4L, 1L, 21L, 
15L, 18L, 7L, 22L), .Label = c("ACAT2", "ADRA2A", "AKR1B1", "BRCA2", 
"CCNE2", "CDC6", "CDO1", "CYP2C9", "EZH2", "GSG2", "HAUS8", "HIST2H2BF", 
"HMGN2", "KIF18A", "MEIS2", "ORC6", "OTC", "PECR", "REG1A", "SKA1", 
"SYCP3", "TOX"), class = "factor"), V5 = c(31L, 68L, 145L, 170L, 
204L, 222L, 235L, 279L, 355L, 556L, 646L, 726L, 789L, 807L, 954L, 
1396L, 1399L, 1639L, 1711L, 1776L), V6 = c(0.774, 0.681, 0.55, 
0.528, 0.5, 0.488, 0.478, 0.443, 0.387, 0.23, 0.17, 0.12, 0.0757, 
0.0619, -0.0229, -0.279, -0.281, -0.418, -0.452, -0.491), V7 = c(0.0747, 
0.135, 0.16, 0.209, 0.25, 0.298, 0.348, 0.377, 0.383, 0.306, 
0.28, 0.253, 0.229, 0.228, 0.155, -0.042, -0.0103, -0.0857, -0.07, 
-0.046), V8 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("NO", 
"YES"), class = "factor")), .Names = c("V1", "V2", "V3", "V4", 
"V5", "V6", "V7", "V8"), row.names = c(NA, 20L), class = "data.frame")

2 个答案:

答案 0 :(得分:3)

好。首先,让我们将您的数据框放在一个简单易用的列表中:

# Gather the individual data frames into one list so they can be processed
# together with lapply().
data_list <- list(df1, df2)

我们要从列表中的每个数据框里提取“V2”列,用来生成各取值的频数表。由于各数据框中该因子列的水平(levels)并不相同,需要先把它们全部从 factor 类转换为 character 类,这样才能在计数之前把它们正确地合并起来:

# Take the V2 column of every data frame as plain character strings (the
# factors have different level sets, so they are converted before being
# concatenated), flatten everything into one vector and tabulate it.
res <- table(unlist(
  lapply(data_list, function(df) as.character(df[["V2"]]))
))
res
  #   ACAT2    AKR1B1    BMPR1B     BRCA2        C9     CCNE2      CDC6      CDO1    CYP2C9 
  #       1         1         1         1         1         1         1         1         1 
  # DEFB126    EDDM3A      EZH2      FZD8      GPR1      GSG2     HAUS8 HIST2H2BF     HMGN2 
  #       1         1         1         1         1         1         1         1         1 
  #  HOMER1     HSPB3      HTR7     IFNA1    IL1RL1    KIF18A    LIMCH1    MAGEH1     MEIS2 
  #       1         1         1         1         1         1         1         1         1 
  #     MGP      MMP8      ORC6       OTC      PECR    PLSCR2   PPP1R42    PTENP1     RGAG1 
  #       1         1         1         1         1         1         1         1         1 
  #    SKA1      SV2C     SYCP3       TOX 
  #       1         1         1         1 

“但是,把所有数据框放入一个列表的好方法是什么?”这个问题在另一个回答中已经有非常详尽的解答。

答案 1 :(得分:1)

我的方法与格雷戈尔不同。如果我正确理解了问题,请尝试以下操作。

#' Count how often each gene of one data frame occurs in other data frames
#'
#' For every value in the second column of `DF1`, counts the number of rows
#' whose second column holds the same value across all data frames in
#' `DF_list`.
#'
#' @param DF1 A data frame whose second column holds the genes to look up.
#' @param DF_list A list of data frames whose second columns are searched.
#' @return A named integer vector with one element per row of `DF1`; the
#'   names are the genes and the values are their occurrence counts. For a
#'   zero-row `DF1` this is an empty integer vector.
count_genes <- function(DF1, DF_list) {
    # Convert to character before combining: the gene columns may be factors
    # with different level sets, and unlist() would otherwise mix up codes.
    pool <- unlist(
        lapply(DF_list, function(df) as.character(df[[2]])),
        use.names = FALSE
    )
    query <- as.character(DF1[[2]])
    # vapply() guarantees an integer result even when `query` is empty,
    # where sapply() would silently return list().
    res <- vapply(
        query,
        function(gene) sum(pool %in% gene),
        integer(1),
        USE.NAMES = FALSE
    )
    names(res) <- query
    res
}

# Data frames to search, collected in a list (only df2 in this example).
df_list <- list(df2)

# For each value in the 2nd column of df1, count how many times it occurs in
# the 2nd columns of the data frames in df_list.
count_genes(df1, df_list)