根据另一个列表,解析并统计列表中的元素(字符串)

时间:2013-12-08 01:43:25

标签: r parsing

我有一个由蛋白质组成的清单:

>head(PPI)
$A1CF
[1] "SYNCRIP" "KHSRP"  

$A2LD1
[1] "PRPSAP2" "RPL15"  

$A2M
[1] "MMP2"     "NGF"      "IL10"     "CELA1"    "KLK3"     "C11orf58" "LCAT"     "IL1B"      "KLK13"    "ANXA6"    "SERPINA1" "TGFBI"    "KLK5"     "LRP1"    
[15] "PDGFA"    "ADAMTS1"  "KLK2"     "KLKB1"    "F2"       "CPB2"     "MYOC"     "PLG"      "CTSE"     "ADAM19"   "SHBG"     "PAEP"     "HSPA5"    "APOE"    
[29] "SPACA3"   "APP"      "PDGFB"   

$AAAS
[1] "EP300"  "NUP214" "NUP133" "NUP37"  "NUP35"  "RANBP2" "NUP210"

$AAGAB
[1] "EIF3C"  "UNC119" "AFTPH" 

$AAK1
[1] "PRKAA1"   "SPEG"     "JAK1"     "KIAA0195" "AURKB"    "MAPK6"    "FER"      "PDE4A"        "ALPK3"    "HIPK1"    "MAP4K5"   "LSM14A"   "TBKBP1"   "FRYL"    
[15] "SIK2"     "PKN3"     "ACOX3"    "MAP4K2"   "TAOK1"    "SIK3"     "AZI2"     "TESK2"    "TBK1"     "KIAA0528" "PTPN18"   "PIP4K2C"  "CAMK2G"   "CABC1"   
[29] "NEK11"

和我要解析的第二个列表(也包含蛋白质名称):

>head(mylist)
$cluster.1
[1] "HSP90AB1" "INMT"     "CKB"      "NR2E1"    "ME3"      "FAM162A"  "KIRREL2" 

$cluster.2
[1] "ENSG00000212860" "TRADD"           "C1QBP"           "KIAA1967"        "ENSG00000137379" "MAP3K3"          "TNFRSF1B"        "BAG2"           
[9] "ENSG00000212866" "RIPK3"           "EPRS"            "HSPA6"           "HSPA5"           "IKBKG"           "TBK1"            "TRAF2"          
[17] "MAP3K7"          "NFKB1"           "MAP3K14"         "HSPA1A"          "MAP3K7IP2"       "HSPBP1"          "NFKB2"           "DNAJA1"         
[25] "TNFRSF1A"        "TRAF3IP2"        "NFKBIA"          "HSPA9"           "ENSG00000183311" "TUBB"            "TUBA3D"          "TANK"           
[33] "ENSG00000215292" "REL"             "MAP3K1"          "HSPA1B"          "HSPA8"           "NFKBIB"          "PGAM5"           "EEF1A2"         
[41] "MAP3K8"          "CLTC"            "RCN2"            "MAP3K7IP1"       "RARS"            "TRAF1"           "TUBA3C"          "HSPA1L"         
[49] "MYO1D"           "NOD1"            "HSP90AA2"        "CAD"             "RELB"            "AIFM1"           "TUBB2B"          "RIPK2"          
[57] "CDC37"           "IKBKB"           "ERLIN1"          "RIPK1"           "TNIP2"           "STUB1"           "TUBB4"           "HSPA2"          
[65] "CHUK"            "DNAJC3"          "CCDC50"          "SLC25A5"         "NFKBIE"          "AK3"             "TICAM1"          "TIMM50"         
[73] "ANKRD17"         "OTUD7B"          "TNFAIP3"         "RPS27L"          "TRPC4AP"         "TUBB6"           "DNAJC6"          "PXMP2"          
[81] "FLJ25006"       

$cluster.3
[1] "ACTB"    "PFN1"    "XPO6"    "VASP"    "ZYX"     "PFN2"    "DIAPH1"  "APBB1IP" "DIAPH2"  "PARVG"   "ENAH"    "PCYT1B"  "PFN4"    "CNN2"    "NSMAF"   "PFN3"   
[17] "LMOD1"  

$cluster.4
[1] "UBB"             "HERC3"           "KLRK1"           "ULBP1"           "RAET1E"          "MICA"            "HCST"            "ENSG00000184444"
[9] "ENSG00000206449" "ULBP2"           "ZNF385A"         "ULBP3"           "RAET1G"         

$cluster.5
[1] "YWHAZ"    "SLAIN2"   "ZC3H13"   "C12orf51" "PGLYRP1"  "ATL3"    

$cluster.6
[1] "ACTG1"   "EPS8L3"  "PARVG"   "TMSB4Y"  "B3GALT1" "UGT1A6"

我想利用PPI中包含的信息,计算第二个列表mylist中每个簇(cluster)的每个成员与同一簇中其他成员发生相互作用的次数。我目前采用的方法如下:

# Keep only the PPI entries that belong to the members of the first cluster.
PPI_sub <- PPI[mylist[[1]]]
# One table per cluster member z: for every entry of PPI_sub, the total
# number of str_count() matches of z among that entry's partner names.
c1.share <- setNames(
  lapply(mylist[[1]], function(z) {
    hits <- sapply(PPI_sub, function(x) sum(str_count(x, z)))
    data.frame(lineNum = 1:length(PPI_sub), count = hits)
  }),
  mylist[[1]]
)
c1.share
    $HSP90AB1
         lineNum count
HSP90AB1       1     1
INMT           2     0
CKB            3     0
NR2E1          4     1
ME3            5     0
FAM162A        6     0
KIRREL2        7     0

$INMT
         lineNum count
HSP90AB1       1     1
INMT           2     0
CKB            3     0
NR2E1          4     0
ME3            5     0
FAM162A        6     0
KIRREL2        7     1

$CKB
         lineNum count
HSP90AB1       1     1
INMT           2     0
CKB            3     0
NR2E1          4     0
ME3            5     1
FAM162A        6     0
KIRREL2        7     0

$NR2E1
         lineNum count
HSP90AB1       1     0
INMT           2     0
CKB            3     0
NR2E1          4     0
ME3            5     0
FAM162A        6     0
KIRREL2        7     0

$ME3
         lineNum count
HSP90AB1       1     0
INMT           2     1
CKB            3     0
NR2E1          4     0
ME3            5     0  
FAM162A        6     0
KIRREL2        7     0

$FAM162A
         lineNum count
HSP90AB1       1     1
INMT           2     0
CKB            3     0
NR2E1          4     0
ME3            5     0
FAM162A        6     0
KIRREL2        7     0

$KIRREL2
         lineNum count
HSP90AB1       1     0
INMT           2     0
CKB            3     0
NR2E1          4     0
ME3            5     0
FAM162A        6     0
KIRREL2        7     0

现在,如果对簇中的每个成员,统计其对应的1的个数,以及它在其余成员的结果中出现的次数,就能得到我想要的数字:

HSP90AB1       5     
INMT           3     
CKB            2    
NR2E1          1     
ME3            2       
FAM162A        1     
KIRREL2        1

我的问题是:我不知道如何自动得到这些最终值,并对mylist中的所有元素重复这一过程。

P.S. 以下是cluster1的各个元素在PPI中对应的内容:
$HSP90AB1
[1] "CKB"   "PDHA1" "ENTPD6"    "FAM162A"   "INMT"  "BOLA2" "MVP"  "HSP90AB1"                     

$INMT
[1] "COX4I1"  "ME3"     "THUMPD1" "KLHL8"   "COX4I2" 

$CKB
[1] "THUMPD1"

$NR2E1
[1] "GSPT1"    "GSPT2"    "KPNA3"    "HSP90AB1" "EIF2B1"   "TLE4"    

$ME3
[1] "SFRS18" "CKB"    "CKM"   

$FAM162A
[1] "HSP90AA2"

$KIRREL2
[1] "INMT"

非常感谢

1 个答案:

答案 0 :(得分:1)

您可以使用Reduce对一个由向量组成的列表求和:
# c1.share is the named list of per-protein data frames built in the question;
# folding it with `+` adds the data frames together element-wise.
Reduce(`+`, c1.share)

把这段代码推广到一般情况并不难:只需将mylist[[1]]替换为一个变量,并把代码放进一个函数里。

#' Sum the within-cluster interaction counts for one cluster.
#'
#' For every member `z` of `mylistsub`, builds a table with one row per
#' cluster member giving how many times `z` is listed among that member's
#' partners in `ppi`, then adds all of those tables together with `+`.
#'
#' @param mylistsub Character vector of protein names (one cluster).
#' @param ppi Named list of character vectors of interaction partners,
#'   indexed by protein name. Defaults to the `PPI` object visible where
#'   `f` is defined.
#' @return A data frame with one row per element of `mylistsub` and the
#'   columns `lineNum` and `count`, each being the element-wise sum over the
#'   per-member tables (so `lineNum` ends up as the row index multiplied by
#'   the number of members). `NULL` when `mylistsub` is empty.
f <- function(mylistsub, ppi = PPI) {
  PPI_sub <- ppi[mylistsub]
  # Label every entry by the cluster member it was requested for, so that
  # proteins absent from `ppi` (NULL partners, hence a zero count) still get
  # a readable row name.
  names(PPI_sub) <- mylistsub
  c1.share <- lapply(mylistsub, function(z) {
    data.frame(
      lineNum = seq_along(PPI_sub),
      # Compare whole names: a regex/substring count would also credit
      # "TUBB" for partners such as "TUBB2B" or "TUBB4".
      count = vapply(
        PPI_sub,
        function(x) sum(x == z, na.rm = TRUE),
        integer(1)
      )
    )
  })
  names(c1.share) <- mylistsub
  # Fold the per-member tables into a single table of totals.
  Reduce(`+`, c1.share)
}

# Run f on each cluster in mylist, collecting the results in a list.
lapply(mylist, f)