我有 18 个数据框,它们看起来像这样:
我需要遍历其中一个数据框的第 7 列,并统计该列中每个值在其余 17 个数据框中出现的频率。我尝试过不同的方法,比如 plyr 包中的 count 函数,但我不确定它们是否适用于多个数据框。
我想要的输出是每个基因在所有数据框中出现的频率(每个基因在单个数据框中最多出现一次):
dput的输出:
> dput(head(df1, 20))
structure(list(V1 = 1:20, V2 = structure(c(16L, 14L, 13L, 32L,
18L, 28L, 1L, 2L, 31L, 25L, 15L, 5L, 23L, 24L, 35L, 7L, 9L, 20L,
29L, 10L), .Label = c("BMPR1B", "C9", "CACNB2", "CLEC12A", "DEFB126",
"DYNC2H1", "EDDM3A", "FAM47C", "FZD8", "GPR1", "GSDMC", "H2AFY2",
"HOMER1", "HSPB3", "HTR7", "IFNA1", "IFNA6", "IL1RL1", "INSL5",
"LIMCH1", "LINC00478", "LRCH2", "MAGEH1", "MGP", "MMP8", "NFIB",
"PDE1A", "PLSCR2", "PPP1R42", "PRKY", "PTENP1", "RGAG1", "SSBP2",
"STEAP4", "SV2C", "TLR2", "TMSB15A", "TOX"), class = "factor"),
V3 = structure(c(16L, 14L, 13L, 32L, 18L, 28L, 1L, 2L, 31L,
25L, 15L, 5L, 23L, 24L, 35L, 7L, 9L, 20L, 29L, 10L), .Label = c("BMPR1B",
"C9", "CACNB2", "CLEC12A", "DEFB126", "DYNC2H1", "EDDM3A",
"FAM47C", "FZD8", "GPR1", "GSDMC", "H2AFY2", "HOMER1", "HSPB3",
"HTR7", "IFNA1", "IFNA6", "IL1RL1", "INSL5", "LIMCH1", "LINC00478",
"LRCH2", "MAGEH1", "MGP", "MMP8", "NFIB", "PDE1A", "PLSCR2",
"PPP1R42", "PRKY", "PTENP1", "RGAG1", "SSBP2", "STEAP4",
"SV2C", "TLR2", "TMSB15A", "TOX"), class = "factor"), V4 = structure(c(16L,
14L, 13L, 32L, 18L, 28L, 1L, 2L, 31L, 25L, 15L, 5L, 23L,
24L, 35L, 7L, 9L, 20L, 29L, 10L), .Label = c("BMPR1B", "C9",
"CACNB2", "CLEC12A", "DEFB126", "DYNC2H1", "EDDM3A", "FAM47C",
"FZD8", "GPR1", "GSDMC", "H2AFY2", "HOMER1", "HSPB3", "HTR7",
"IFNA1", "IFNA6", "IL1RL1", "INSL5", "LIMCH1", "LINC00478",
"LRCH2", "MAGEH1", "MGP", "MMP8", "NFIB", "PDE1A", "PLSCR2",
"PPP1R42", "PRKY", "PTENP1", "RGAG1", "SSBP2", "STEAP4",
"SV2C", "TLR2", "TMSB15A", "TOX"), class = "factor"), V5 = c(46L,
49L, 90L, 93L, 99L, 150L, 272L, 373L, 472L, 478L, 489L, 540L,
661L, 855L, 889L, 947L, 971L, 1002L, 1007L, 1080L), V6 = c(0.732,
0.717, 0.635, 0.633, 0.624, 0.545, 0.449, 0.374, 0.3, 0.295,
0.284, 0.244, 0.16, 0.0366, 0.0121, -0.0197, -0.0334, -0.0546,
-0.056, -0.0982), V7 = c(0.0345, 0.0903, 0.12, 0.169, 0.216,
0.233, 0.205, 0.183, 0.155, 0.176, 0.193, 0.186, 0.136, 0.0379,
0.0215, -0.00676, -0.0162, -0.0275, -0.0252, -0.0551), V8 = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("NO", "YES"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8"), row.names = c(NA,
20L), class = "data.frame")
> dput(head(df2, 20))
structure(list(V1 = 1:20, V2 = structure(c(5L, 16L, 9L, 11L,
12L, 13L, 6L, 10L, 20L, 14L, 3L, 8L, 17L, 4L, 1L, 21L, 15L, 18L,
7L, 22L), .Label = c("ACAT2", "ADRA2A", "AKR1B1", "BRCA2", "CCNE2",
"CDC6", "CDO1", "CYP2C9", "EZH2", "GSG2", "HAUS8", "HIST2H2BF",
"HMGN2", "KIF18A", "MEIS2", "ORC6", "OTC", "PECR", "REG1A", "SKA1",
"SYCP3", "TOX"), class = "factor"), V3 = structure(c(5L, 16L,
9L, 11L, 12L, 13L, 6L, 10L, 20L, 14L, 3L, 8L, 17L, 4L, 1L, 21L,
15L, 18L, 7L, 22L), .Label = c("ACAT2", "ADRA2A", "AKR1B1", "BRCA2",
"CCNE2", "CDC6", "CDO1", "CYP2C9", "EZH2", "GSG2", "HAUS8", "HIST2H2BF",
"HMGN2", "KIF18A", "MEIS2", "ORC6", "OTC", "PECR", "REG1A", "SKA1",
"SYCP3", "TOX"), class = "factor"), V4 = structure(c(5L, 16L,
9L, 11L, 12L, 13L, 6L, 10L, 20L, 14L, 3L, 8L, 17L, 4L, 1L, 21L,
15L, 18L, 7L, 22L), .Label = c("ACAT2", "ADRA2A", "AKR1B1", "BRCA2",
"CCNE2", "CDC6", "CDO1", "CYP2C9", "EZH2", "GSG2", "HAUS8", "HIST2H2BF",
"HMGN2", "KIF18A", "MEIS2", "ORC6", "OTC", "PECR", "REG1A", "SKA1",
"SYCP3", "TOX"), class = "factor"), V5 = c(31L, 68L, 145L, 170L,
204L, 222L, 235L, 279L, 355L, 556L, 646L, 726L, 789L, 807L, 954L,
1396L, 1399L, 1639L, 1711L, 1776L), V6 = c(0.774, 0.681, 0.55,
0.528, 0.5, 0.488, 0.478, 0.443, 0.387, 0.23, 0.17, 0.12, 0.0757,
0.0619, -0.0229, -0.279, -0.281, -0.418, -0.452, -0.491), V7 = c(0.0747,
0.135, 0.16, 0.209, 0.25, 0.298, 0.348, 0.377, 0.383, 0.306,
0.28, 0.253, 0.229, 0.228, 0.155, -0.042, -0.0103, -0.0857, -0.07,
-0.046), V8 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("NO",
"YES"), class = "factor")), .Names = c("V1", "V2", "V3", "V4",
"V5", "V6", "V7", "V8"), row.names = c(NA, 20L), class = "data.frame")
答案 0 :(得分:3)
好。首先,让我们将您的数据框放在一个简单易用的列表中:
# Gather the individual data frames into one list so they can be
# processed together.
data_list <- list(df1, df2)
我们要从列表中的每个数据框里提取 `V2` 列,用这些值制作一张频数表。在开始计数之前,需要先把它们全部从 `factor` 类转换为 `character` 类,这样才能将它们合并在一起:
# Build one frequency table of the values found in column "V2" across
# every data frame in data_list.
res <- local({
  # Take "V2" from each data frame as plain character, so that factors
  # with different level sets can be concatenated.
  gene_names <- lapply(data_list, function(d) as.character(d[["V2"]]))
  # Flatten into a single vector and count each distinct value.
  table(unlist(gene_names))
})
res
# ACAT2 AKR1B1 BMPR1B BRCA2 C9 CCNE2 CDC6 CDO1 CYP2C9
# 1 1 1 1 1 1 1 1 1
# DEFB126 EDDM3A EZH2 FZD8 GPR1 GSG2 HAUS8 HIST2H2BF HMGN2
# 1 1 1 1 1 1 1 1 1
# HOMER1 HSPB3 HTR7 IFNA1 IL1RL1 KIF18A LIMCH1 MAGEH1 MEIS2
# 1 1 1 1 1 1 1 1 1
# MGP MMP8 ORC6 OTC PECR PLSCR2 PPP1R42 PTENP1 RGAG1
# 1 1 1 1 1 1 1 1 1
# SKA1 SV2C SYCP3 TOX
# 1 1 1 1
“但是,把所有数据框放进一个列表的好方法是什么?”——这个问题在另一个回答中已有非常详尽的解答。
答案 1 :(得分:1)
我的方法与 Gregor 的不同。如果我对问题的理解正确,可以试试下面的做法。
#' Count how often each gene of one data frame occurs in other data frames
#'
#' For every value in column `col` of `DF1`, counts the number of rows
#' across all data frames in `DF_list` whose column `col` holds that same
#' value. Values are compared as character strings, so factor columns with
#' different level sets are handled.
#'
#' @param DF1 Data frame whose column `col` supplies the values to look up.
#' @param DF_list List of data frames to search. A single data frame is
#'   also accepted and is treated as a list of length one.
#' @param col Column index or name holding the gene names, used for `DF1`
#'   and for every element of `DF_list`. Defaults to 2.
#' @return A named integer vector with one element per row of `DF1`, in
#'   row order; names are the looked-up values, elements are their counts.
#'   An empty integer vector is returned when `DF1` has no rows.
count_genes <- function(DF1, DF_list, col = 2) {
  # A data frame is itself a list (of columns); iterating over it directly
  # would index into each column instead of each data frame.
  if (is.data.frame(DF_list)) {
    DF_list <- list(DF_list)
  }

  # Pool the gene column of every data frame as character.
  genes <- unlist(lapply(DF_list, function(d) as.character(d[[col]])))

  # Coerce the lookup values once instead of once per element.
  targets <- as.character(DF1[[col]])

  # vapply() guarantees an integer vector even when `targets` is empty,
  # where sapply() would return an empty list.
  res <- vapply(
    targets,
    function(g) sum(genes %in% g),
    integer(1),
    USE.NAMES = FALSE
  )
  names(res) <- targets
  res
}
# Data frames to search; only df2 in this example.
df_list <- list(df2)
# For each gene in df1, count its occurrences across df_list.
count_genes(DF1 = df1, DF_list = df_list)