我有一个列表(list1),其中每个元素以某个蛋白质命名,内容是与该蛋白质相互作用的蛋白质(字符向量);另外还有一个由构成簇的蛋白质组成的字符向量(clusters)。我想构建一个由data.frame组成的列表,统计该字符向量的每个元素在列表的每个元素中出现的次数。
请在此处找到我的数据摘要,以获得解释性示例:
>list1
$ENSG00000206212
[1] "SNORA70" "RPS27" "HIST1H3F" "RPL37" "ENSG00000196656" "EIF4E" "EIF3I" "FAU"
[9] "EIF4H" "ENSG00000206212" "WDR3" "HIST1H4C" "EIF3C" "SNORD33" "RPS12" "HIST1H4B"
[17] "RPL39" "ENSG00000173534" "RPL7" "HIST1H3I" "HSPA5" "NOL6" "RPL17P39" "ENSG00000182498"
[25] "RPL30" "ENO1" "EMG1" "RPL8" "RPS27A" "RPL26" "BOP1" "NOC4L"
[33] "HIST1H3D" "IDH3A" "YBX1" "LOC100505503" "EIF3J" "RPL35A" "PDCD11" "ENSG00000182953"
[41] "RPS11" "HIST1H4J" "DDX27" "HIST1H4D" "ATP5A1" "ENSG00000204221" "RPL29" "CAMK2D"
[49] "WDR74" "ENSG00000183311" "ENSG00000206284" "ENSG00000175333" "HIST1H2BE" "RPS20" "RPS7P1" "RPL36"
[57] "KRR1" "HNRNPH1" "HEATR1" "SLC25A4" "HIST1H2BI" "TBK1" "RPL11" "UTP20"
[65] "RPL3L" "PRPF19" "MYL6" "RPS14" "RPS17" "EIF3K" "RPL18" "BXDC2"
[73] "ENSG00000187899" "RPL27AP" "HNRNPK" "HIST1H3H" "DDX5" "HIST1H2AB" "CAPZB" "S100A6"
[81] "HIST1H4H" "RPS19" "BXDC5" "PHB" "EIF5A" "MYL12A" "ENSG00000206212" "DECR1"
[89] "RPL27" "RPL15P3" "HIST2H4B" "DDX18" "WIZ" "RPL4" "TUBB2C" "RPS5"
[97] "NOP14" "ENSG00000198637" "RPL34" "ENSG00000138396" "EIF3B" "HIST1H2BH" "RUVBL2" "FTSJ3"
[105] "EIF4A1" "HIST1H2BG" "EIF2S3" "EIF3E" "HIST1H4K" "HSPA8" "IMP3" "UTP15"
[113] "RRP9" "RPL13AP25" "IMP4" "RPL21P28" "SNORD55" "PWP2" "ENSG00000197303" "EIF4A3"
[121] "RPS13" "BYSL" "FBL" "RPS3" "RPL38" "HIST1H2AE" "NFKB2" "HIST1H4I"
$SNORD24
[1] "HIST1H2AI" "RPLP2" "ENSG00000130041" "RPL27" "RPL26" "RPS15" "RPL9" "YBX1"
[9] "ENSG00000196656" "RPL15P3" "HIST1H2AL" "RPS4Y1" "RPS26" "RPL14" "RPS19" "EEF1A1P9"
[17] "RPLP1" "RPL12" "SF3B3" "GRINL1B" "BXDC2" "LAS1L" "DDX27" "SSRP1"
[25] "DDX5" "WDR12" "ENSG00000173534" "RPS4X" "EIF3I" "RPL7" "RPS26P2" "ENSG00000182498"
[33] "RPL39" "RPLP0" "DDX18" "RPL28" "HNRNPA1"
$ENSG00000187899
[1] "EMG1" "WDR3" "RPL8" "ENSG00000182498" "ENSG00000206212" "SNORD33" "CIRH1A" "LARP2"
[9] "ENSG00000187899" "RPL17P39" "SNORA70" "UTP11L" "NOL6" "LOC100130702" "RPL30" "ENSG00000174766"
[17] "MPHOSPH10" "RSL1D1" "RPL18" "ENSG00000096150" "RPL4" "UTP20" "RPS4X" "PRPF19"
[25] "RBM19" "RPL27AP" "PDCD11" "KRR1" "IMP4" "ENSG00000204221" "IMP3" "RRP9"
[33] "FBL" "RPL3" "HSP90AB1" "EIF5A" "RPS19" "TBL3" "RPS15" "SNORD4A"
[41] "AATF" "RPL19" "ENSG00000007816" "NOP58" "RPLP0" "RIOK2" "DHX37" "WDSOF1"
[49] "MKI67IP" "SMARCB1" "RPL10A" "ENSG00000138396" "RPL15P3" "RPS17" "SNORD73A" "RPL14"
[57] "RPL18A" "NOL10" "ENSG00000175333" "RPL7P" "ENSG00000187899" "RPL18AP11" "PSMD4" "RPL9"
[65] "PNO1" "RPS5" "RPL13AP25" "NAF1" "RPL14P1"
$RPLP2
[1] "RPLP2" "SSPO" "EEF2" "ENSG00000182498" "MAP3K14" "DDX55" "RPL30" "RPL17P39"
[9] "RPS29" "HIST1H4B" "RPL12" "RPS12" "SNORA70" "MT3" "PKM2" "ENSG00000198637"
[17] "PWP2" "RPL18" "HIST1H4J" "HNRNPH2" "RPS27" "RELB" "ACTG1" "ENSG00000174766"
[25] "RRP9" "HIST1H4I" "RPS26" "DCP2" "HIST1H4C" "RPL39" "UTP11L" "ENSG00000206212"
[33] "SEC23A" "RPL37" "TUBB4" "MKI67IP" "ENSG00000196656" "FAU" "ESF1" "RPS4X"
[41] "RPL8" "BMS1" "ENSG00000206284" "HSPA5" "PXN" "RPL29" "SNORD33" "HSPA8"
[49] "LOC100130702" "RPL35A" "SSPO" "RPL14" "GSPT2" "PPP1CB" "WDR3" "HIST2H4A"
[57] "MRPL2" "PPP1R12A" "RPS15A" "RPL26L1" "DDX49" "RPL7" "RCL1" "ENSG00000187899"
$RPL15P3
[1] "RPL19" "RPL32P18" "RPL36AL" "ENSG00000175333" "RPL15P3" "RPL18" "RPL9P7" "IMPDH1"
[9] "ENSG00000206212" "ENSG00000096150" "RPS4X" "RPS15A" "DDX18" "DDX24" "SNORD33" "RPL27A"
[17] "LARP2" "RPF2" "ENSG00000187899" "RPS7P1" "PWP1" "RPL12" "RPL21P28" "RPL17P39"
[25] "RPL15" "RRS1" "RPLP2" "PDCD11" "DDX54" "RPL8" "ENSG00000182498" "RPS25"
[33] "LOC100130702" "RPL30" "CEBPZ" "SNORD55" "RBM34" "HSPA8" "ATP6V1B2" "RPL3"
[41] "RPL4" "GAR1" "SNORA70" "RPS17" "KPNB1" "KIAA0020" "RPL13AP25" "SNORD21"
[49] "ENSG00000174766" "RPS5" "BXDC2" "DDX3Y" "SNORD24" "RPL7P" "RPL32" "RPS13"
[57] "SUMO1P3" "RPL27" "RPS7" "RPS3" "PES1" "RPL15P3" "RPS27A" "ENSG00000185637"
[65] "ENSG00000212981" "RPL38" "RPS6" "NOC3L" "RPL18AP11" "ENSG00000138396" "RRP15" "UBC"
$RPL19
[1] "ACTG1" "RPL8" "RPL15" "RPS29" "RPS27" "TCF12" "RPL37" "RPLP2"
[9] "RPL18" "ENSG00000096150" "RPL29" "RPL4" "HSPA8" "DDX18" "LOC100130702" "ENSG00000182498"
[17] "EXOC2" "ENSG00000187899" "FTSJ3" "NOTCH1" "FAU" "PXN" "RPS26" "RPL14"
[25] "DYNLL2" "RPL19" "ENSG00000198637" "RPL30" "RPS4X" "TEX10" "NIP7" "RPL12"
[33] "RPL35A" "ENSG00000196656" "RPL7" "RPL26L1" "PCBP3" "DHX15" "RPL17P39" "RPS12"
[41] "RPS15A" "MKI67IP" "GSPT2" "SNORD33" "RRP1B" "ENSG00000206212" "SNORA70" "RPL21P28"
[49] "ENSG00000174766" "RPL39" "RPL15P3" "RPL28" "YARS" "ENSG00000138396" "ITGA8" "YWHAZ"
[57] "KRR1" "ETF1" "SNORA62" "ACTB" "RPL37A" "PES1" "RPS6" "NOP56"
>clusters
[1] "ENSG00000206212" "SNORD24" "ENSG00000187899" "RPLP2" "RPL15P3" "RPL19" "EIF4A1" "RPS3"
[9] "RPLP0" "RPS15" "ENSG00000196656" "RPL21P28" "RPS4X"
我正在寻找的输出示例是:
[[1]]
lineNum count
ENSG00000206212 1 2
SNORD24 2 1
ENSG00000187899 3 1
RPLP2 4 1
RPL15P3 5 1
RPL19 6 1
EIF4A1 7 1
RPS3 8 1
RPLP0 9 1
RPS15 10 1
ENSG00000196656 11 1
RPL21P28 12 1
RPS4X 13 1
[[2]]
lineNum count
ENSG00000206212 1 1
SNORD24 2 2
ENSG00000187899 3 1
RPLP2 4 1
RPL15P3 5 1
RPL19 6 1
EIF4A1 7 0
RPS3 8 1
RPLP0 9 1
RPS15 10 1
ENSG00000196656 11 1
RPL21P28 12 1
RPS4X 13 1
[[3]]
lineNum count
ENSG00000206212 1 1
SNORD24 2 1
ENSG00000187899 3 2
RPLP2 4 1
RPL15P3 5 1
RPL19 6 1
EIF4A1 7 0
RPS3 8 1
RPLP0 9 1
RPS15 10 1
ENSG00000196656 11 0
RPL21P28 12 0
RPS4X 13 1
我无法发布我的数据的dput输出,因为它太大了,我希望此示例足以说明问题。
我正在使用以下命令行,但它给我一个错误的结果,因为有时我得到的值高于1
# For each element z of clusters, build a data.frame with one row per element
# of list1: lineNum is the element's position and count is the sum of
# str_count(x, z) over the strings x stored in that element.
# NOTE(review): str_count presumably comes from stringr and treats z as a
# pattern, so repeated entries and partial matches inside longer names can push
# count above 1 - confirm.
ll <- lapply(clusters, function(z) data.frame(lineNum=1:length(list1), count=sapply(list1, function(x) sum(str_count(x,z)))))
谢谢
更新:
另一个例子,我希望更清楚:
>list1
$HSP90AB1
[1] "TALDO1" "ENSG00000130041" "HSP90AB1" "CLNS1A" "IMNT" "CKB" "RUVBL1" "NR2E1" "FAM162A"
$INMT
[1] "COX4I1" "ME3" "THUMPD1" "KLHL8" "COX4I2" "KIRREL2" "HSP90AB1"
$CKB
[1] "THUMPD1" "SLC12A5" "NFKB1" "RPS3" "ME3" "ASNA1" "KLHL8" "SLC12A4" "CKM" "HSP90AB1" "BLOC1S1" "SERP2" "ASB9"
$NR2E1
[1] "GSPT1" "GSPT2" "KPNA3" "HSP90AB1" "EIF2B1" "TLE4"
$ME3
[1] "SFRS18" "CKB" "CKM" "INMT"
$FAM162A
[1] "HSP90AA2" "IKBKE" "HSP90AB1"
$KIRREL2
[1] "INMT"
输出如下:
[[1]]
lineNum count
HSP90AB1 1 2
INMT 2 1
CKB 3 1
NR2E1 4 1
ME3 5 0
FAM162A 6 1
KIRREL2 7 0
[[2]]
lineNum count
HSP90AB1 1 1
INMT 2 0
CKB 3 0
NR2E1 4 0
ME3 5 1
FAM162A 6 0
KIRREL2 7 1
[[3]]
lineNum count
HSP90AB1 1 1
INMT 2 0
CKB 3 0
NR2E1 4 0
ME3 5 1
FAM162A 6 0
KIRREL2 7 0
[[4]]
lineNum count
HSP90AB1 1 1
INMT 2 0
CKB 3 0
NR2E1 4 0
ME3 5 0
FAM162A 6 0
KIRREL2 7 0
[[5]]
lineNum count
HSP90AB1 1 0
INMT 2 1
CKB 3 1
NR2E1 4 0
ME3 5 0
FAM162A 6 0
KIRREL2 7 0
[[6]]
lineNum count
HSP90AB1 1 1
INMT 2 0
CKB 3 0
NR2E1 4 0
ME3 5 0
FAM162A 6 0
KIRREL2 7 0
[[7]]
lineNum count
HSP90AB1 1 0
INMT 2 1
CKB 3 0
NR2E1 4 0
ME3 5 0
FAM162A 6 0
KIRREL2 7 0
但输出的第一个元素应仅由1(clusters中的元素出现在list1的对应元素中)或0(不匹配)组成,即$HSP90AB1应为1而非2。
答案 0 :(得分:2)
未测试,因为您没有为较小的示例提供dput输出,但请尝试以下操作:
# For every element of list1, report how often each member of clusters occurs
# in it. The result is a list of data frames, one per element of list1, with
# one row per cluster member.
lapply(list1, function(partners, members) {
  # Frequency table of the entries stored in this element of list1.
  freq <- table(partners)
  # TRUE for the cluster members that occur at least once in this element.
  present <- members %in% names(freq)
  # Tabulated frequency for members that occur, 0 for the rest.
  n_hits <- ifelse(present, freq[members], 0L)
  res <- data.frame(lineNum = seq_along(members), count = n_hits)
  # Label the rows with the cluster member names.
  rownames(res) <- members
  res
}, members = clusters)
答案 1 :(得分:1)
我认为这与你想要的非常接近(尽管不完全一样)。只要列表各元素中出现的所有蛋白质都包含在列表的名称中(这样才能与因子水平对应),它就能正常工作——否则就会出问题。
我编辑了代码,以展示如何处理未(按名称)排序的列表。
<strong>修改</strong>
# Example input: a named list of character vectors whose names are
# deliberately not in alphabetical order.
L <- list(
  B = c("A", "A", "B", "B", "C"),
  A = c("A", "B", "B", "B", "A"),
  C = c("B", "C", "B", "B", "C")
)
# Reorder the list elements alphabetically by name.
L <- L[order(names(L))]
<strong>/修改</strong>
# The list names serve as the factor levels, so a name that does not occur in
# an element still gets a row with a zero count.
nm <- names(L)
# For each element, tabulate its entries against all levels and return the
# counts as a data frame.
lapply(L, function(l) as.data.frame(table(factor(l, levels = nm))))
$A
Var1 Freq
1 A 2
2 B 2
3 C 1
$B
Var1 Freq
1 A 2
2 B 3
3 C 0
$C
Var1 Freq
1 A 0
2 B 3
3 C 2