从data.frame过滤数据

时间:2014-12-09 11:56:19

标签: r

我不知道该给这个帖子起什么标题。如果有人有更好的想法,请帮忙修改。

我有一个大数据集,看起来像这样:

> dput(head(tbl_all1))
structure(list(Gene.name = structure(1:6, .Label = c("at1g01050.1", 
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1", 
"at1g01470.1", "at1g01800.1", "at1g01910.5"), class = "factor"), 
    X1_1 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_2 = c(0, 0, 0, 0, 0, 
    0), X1_3 = c(0, 1.91253218, 0, 0, 0, 0), X1_4 = c(0, 1.263597939, 
    0, 0, 0, 0), X1_5 = c(0, 0.997262469, 0, 0, 0, 1), X1_6 = c(0, 
    0.836333482, 0.186450525, 0, 0, 1), X1_7 = c(0.713761294, 
    0.998433284, 0, 0, 0, 0), X1_8 = c(0, 1.002737531, 0.105799533, 
    0, 0, 0), X1_9 = c(1.143119353, 0.720766625, 0.763452684, 
    1, 0, 0), X1_10 = c(0, 0, 0.979400044, 0, 0, 0), X1_11 = c(0, 
    0.715595925, 1.120768885, 0, 1, 0), X1_12 = c(0L, 0L, 1L, 
    0L, 1L, 0L), X1_13 = c(0, 0, 1.276209448, 0, 1.106174824, 
    0), X1_14 = c(0, 0, 0.970143925, 0, 0.897284653, 0), X1_15 = c(0L, 
    0L, 0L, 0L, 0L, 0L), X1_16 = c(0, 0, 0.85529218, 0, 0.678275003, 
    0), X1_17 = c(0, 0, 0, 0, 1.313616463, 0), X1_18 = c(0, 0, 
    1.018244397, 0, 0.731395183, 0), X1_19 = c(0, 0, 2.138711024, 
    0, 1.268604817, 0), X1_20 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_21 = c(0, 
    0, 1.554696031, 0, 2.128263835, 0), X1_22 = c(0L, 0L, 0L, 
    0L, 0L, 0L), X1_23 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_24 = c(0, 
    0.553801084, 1.681551744, 0, 1, 0)), .Names = c("Gene.name", 
"X1_1", "X1_2", "X1_3", "X1_4", "X1_5", "X1_6", "X1_7", "X1_8", 
"X1_9", "X1_10", "X1_11", "X1_12", "X1_13", "X1_14", "X1_15", 
"X1_16", "X1_17", "X1_18", "X1_19", "X1_20", "X1_21", "X1_22", 
"X1_23", "X1_24"), row.names = c(NA, 6L), class = "data.frame")

在我的真实数据中,它只有3000行。第一列是基因的名称(例如 at1g01090.1),我想根据其他 data.frame 来过滤这些行。

我还有20个其他较小的数据框(与我的主数据集相比较小),看起来像这样:

> dput(head(tbl_check))
structure(list(locus.description = structure(1:6, .Label = c("AT1G02000,GAE2", 
"AT1G02640,BXL2", "AT1G02730,CSLD5", "AT1G02790,PGA4", "AT1G02810,pectinesterase 7", 
"AT1G02850,BGLU11", "AT1G04920,SPS3F", "AT1G05610,APS2", "AT1G06020,putative fructokinase-3", 
"AT1G06030,probable fructokinase-2", "AT1G06410,TPS7", "AT1G06780,GAUT6", 
"AT1G11580,PMEPCRA", "AT1G11590,probable pectinesterase/pectinesterase inhibitor 19", 
"AT1G12240,ATBETAFRUCT4", "AT1G16980,TPS2", "AT1G17000,TPS3", 
"AT1G18580,GAUT11", "AT1G22210,probable trehalose-phosphate phosphatase C", 
"AT1G23190,putative phosphoglucomutase", "AT1G23870,TPS9", "AT1G26560,BGLU40", 
"AT1G26570,UGD1", "AT1G27680,APL2", "AT1G35910,probable trehalose-phosphate phosphatase D", 
"AT1G45191,beta-glucosidase 1", "AT1G47840,HXK3", "AT1G50460,HKL1", 
"AT1G53830,PME2", "AT1G60140,TPS10", "AT1G61820,BGLU46", "AT1G62660,beta-fructofuranosidase", 
"AT1G66270,BGLU21", "AT1G66280,BGLU22", "AT1G66430,pfkB-like carbohydrate kinase family protein", 
"AT1G68020,ATTPS6", "AT1G69940,PPME1", "AT1G70290,TPS8"), class = "factor")), .Names = "locus.description", row.names = c(NA, 
6L), class = "data.frame")

我想过滤我的第一个 data.frame tbl_all1,只保留那些可以在这20个数据框中找到的基因。当然,我还想保留 tbl_all1 中的其他列及其值。

这就是我自己做的事情:

## Load the main table; its first column holds the gene names.
tbl_all1 <- read.csv("1st gel - 24 fractions1.csv")

## Other folder now
## `pattern` is a regular expression (not a shell glob), so match the
## ".csv" extension explicitly and anchor it at the end of the file name.
list_of_data <- list.files(pattern = "\\.csv$")
tbl_met <- lapply(list_of_data, read.csv)
## Bind all of the tables in the list by row.
tbl <- data.table::rbindlist(tbl_met)

## Column with the gene identifiers we are interested in.
## NOTE(review): the sample table shown in this post names the column
## `locus.description`; `$locus` then presumably relies on partial matching.
## Confirm the real column name in the CSV files.
vec_names <- tbl$locus
vec <- unique(vec_names) ## removing the duplicates


## Selecting the metabolism genes in data sets

## Keep only the first 9 characters of each gene name so that names such as
## "at1g01050.1" lose their ".1" suffix. The table loaded above is
## `tbl_all1`; the original code referred to an undefined `tbl_all`.
names_all <- substr(tbl_all1[, 1], 1, 9)

tbl_all1[, 1] <- names_all

## Changing lowercase letters for uppercase in every character column, so
## the gene names compare equal to the upper-case identifiers in `vec`.
tbl_ready <- data.frame(lapply(tbl_all1, function(v) {
  if (is.character(v)) {
    toupper(v)
  } else {
    v
  }
}))

## Keep only the rows whose gene name occurs in one of the reference tables.
data <- tbl_ready[tbl_ready$Gene.name %in% vec, ]

这样运行得很好,但问题是我想知道过滤后的每个基因来自哪个数据框。也就是说,我想在输出中添加一个新列,说明每个基因来自哪个数据表。

这是我加载的文件列表:

> list_of_data
 [1] "KEGG AminoAcid Metabolism.csv"          
 [2] "KEGG basal TFs.csv"                     
 [3] "KEGG Carotinoid Biosynthesis.csv"       
 [4] "KEGG Chlorophyll Biosynthesis.csv"      
 [5] "KEGG DNA replication.csv"               
 [6] "KEGG Glucosinolate Metabolism.csv"      
 [7] "KEGG Light Harvesting.csv"              
 [8] "KEGG Photosynthesis.csv"                
 [9] "KEGG Plastoquinone Biosynthesis.csv"    
[10] "KEGG Proteasome.csv"                    
[11] "KEGG regul autophagy.csv"               
[12] "KEGG RNA Polymerase.csv"                
[13] "KEGG sulfate metabolism.csv"            
[14] "KEGG Terpenoid Backbone_Isoprenoids.csv"
[15] "KEGG thiamine metabolism.csv"           
[16] "KEGG Tryptophan BS.csv"                 
[17] "KEGG_starch.csv"                        
[18] "KEGG_TCA cycle.csv"                     
[19] "RNA degradation.csv"                    
[20] "tRNA synthesis.csv"     

希望有足够的信息帮助我!

我的合并数据:

> dput(head(tbl_data))
structure(list(X = c(3L, 19L, 28L, 33L, 34L, 35L), Gene.name = structure(1:6, .Label = c("AT1G01090", 
"AT1G02500", "AT1G03130", "AT1G03330", "AT1G03475", "AT1G03630", 
"AT1G04410", "AT1G04810", "AT1G06410", "AT1G06570", "AT1G06680", 
"AT1G07370", "AT1G07780", "AT1G08490"), class = "factor"), X1_1 = c(0, 
0, 0, 0, 0, 0), X1_2 = c(0, 0, 0, 1.133759575, 0, 0), X1_3 = c(0, 
1.389359906, 0, 0, 1, 0), X1_4 = c(0, 0.628125036, 0, 1.161302798, 
1, 0), X1_5 = c(0, 1.231056083, 0, 0.50892594, 0.01594858, 0), 
    X1_6 = c(0.186450525, 1, 0, 0.514811996, 0, 0), X1_7 = c(0, 
    1.149134552, 0, 0, 0, 0), X1_8 = c(0.105799533, 0.386877662, 
    0, 0, 0, 0), X1_9 = c(0.763452684, 0.554554123, 0, 0, 0, 
    0), X1_10 = c(0.979400044, 0, 0, 0, 0, 0), X1_11 = c(1.120768885, 
    0.274641072, 0, 0, 0, 0.690696043), X1_12 = c(1, 0, 0, 0, 
    0, 0), X1_13 = c(1.276209448, 0, 0, 0, 0, 0), X1_14 = c(0.970143925, 
    0, 1, 0, 0, 0), X1_15 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_16 = c(0.85529218, 
    0, 0, 0, 0, 1), X1_17 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_18 = c(1.018244397, 
    0, 0, 0, 0, 0), X1_19 = c(2.138711024, 0, 0, 0, 0, 0), X1_20 = c(0L, 
    0L, 0L, 0L, 0L, 0L), X1_21 = c(1.554696031, 0, 0, 0, 0, 0
    ), X1_22 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_23 = c(0L, 0L, 0L, 
    0L, 0L, 0L), X1_24 = c(1.681551744, 0, 1, 0, 0, 1.309303957
    )), .Names = c("X", "Gene.name", "X1_1", "X1_2", "X1_3", 
"X1_4", "X1_5", "X1_6", "X1_7", "X1_8", "X1_9", "X1_10", "X1_11", 
"X1_12", "X1_13", "X1_14", "X1_15", "X1_16", "X1_17", "X1_18", 
"X1_19", "X1_20", "X1_21", "X1_22", "X1_23", "X1_24"), row.names = c(NA, 
6L), class = "data.frame")

0 个答案:

没有答案