我不知道该给这个帖子起什么标题。如果有人有更好的想法,请修改它。
我有一个大数据集,看起来像这样:
> dput(head(tbl_all1))
structure(list(Gene.name = structure(1:6, .Label = c("at1g01050.1",
"at1g01080.1", "at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1",
"at1g01470.1", "at1g01800.1", "at1g01910.5"), class = "factor"),
X1_1 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_2 = c(0, 0, 0, 0, 0,
0), X1_3 = c(0, 1.91253218, 0, 0, 0, 0), X1_4 = c(0, 1.263597939,
0, 0, 0, 0), X1_5 = c(0, 0.997262469, 0, 0, 0, 1), X1_6 = c(0,
0.836333482, 0.186450525, 0, 0, 1), X1_7 = c(0.713761294,
0.998433284, 0, 0, 0, 0), X1_8 = c(0, 1.002737531, 0.105799533,
0, 0, 0), X1_9 = c(1.143119353, 0.720766625, 0.763452684,
1, 0, 0), X1_10 = c(0, 0, 0.979400044, 0, 0, 0), X1_11 = c(0,
0.715595925, 1.120768885, 0, 1, 0), X1_12 = c(0L, 0L, 1L,
0L, 1L, 0L), X1_13 = c(0, 0, 1.276209448, 0, 1.106174824,
0), X1_14 = c(0, 0, 0.970143925, 0, 0.897284653, 0), X1_15 = c(0L,
0L, 0L, 0L, 0L, 0L), X1_16 = c(0, 0, 0.85529218, 0, 0.678275003,
0), X1_17 = c(0, 0, 0, 0, 1.313616463, 0), X1_18 = c(0, 0,
1.018244397, 0, 0.731395183, 0), X1_19 = c(0, 0, 2.138711024,
0, 1.268604817, 0), X1_20 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_21 = c(0,
0, 1.554696031, 0, 2.128263835, 0), X1_22 = c(0L, 0L, 0L,
0L, 0L, 0L), X1_23 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_24 = c(0,
0.553801084, 1.681551744, 0, 1, 0)), .Names = c("Gene.name",
"X1_1", "X1_2", "X1_3", "X1_4", "X1_5", "X1_6", "X1_7", "X1_8",
"X1_9", "X1_10", "X1_11", "X1_12", "X1_13", "X1_14", "X1_15",
"X1_16", "X1_17", "X1_18", "X1_19", "X1_20", "X1_21", "X1_22",
"X1_23", "X1_24"), row.names = c(NA, 6L), class = "data.frame")
我的真实数据只有3000行。第一列是基因的名称(例如 at1g01090.1),我想根据其他 data.frame 来过滤这些行。
我还有20个其他的小型数据框(与我的主数据集相比较小),看起来像这样:
> dput(head(tbl_check))
structure(list(locus.description = structure(1:6, .Label = c("AT1G02000,GAE2",
"AT1G02640,BXL2", "AT1G02730,CSLD5", "AT1G02790,PGA4", "AT1G02810,pectinesterase 7",
"AT1G02850,BGLU11", "AT1G04920,SPS3F", "AT1G05610,APS2", "AT1G06020,putative fructokinase-3",
"AT1G06030,probable fructokinase-2", "AT1G06410,TPS7", "AT1G06780,GAUT6",
"AT1G11580,PMEPCRA", "AT1G11590,probable pectinesterase/pectinesterase inhibitor 19",
"AT1G12240,ATBETAFRUCT4", "AT1G16980,TPS2", "AT1G17000,TPS3",
"AT1G18580,GAUT11", "AT1G22210,probable trehalose-phosphate phosphatase C",
"AT1G23190,putative phosphoglucomutase", "AT1G23870,TPS9", "AT1G26560,BGLU40",
"AT1G26570,UGD1", "AT1G27680,APL2", "AT1G35910,probable trehalose-phosphate phosphatase D",
"AT1G45191,beta-glucosidase 1", "AT1G47840,HXK3", "AT1G50460,HKL1",
"AT1G53830,PME2", "AT1G60140,TPS10", "AT1G61820,BGLU46", "AT1G62660,beta-fructofuranosidase",
"AT1G66270,BGLU21", "AT1G66280,BGLU22", "AT1G66430,pfkB-like carbohydrate kinase family protein",
"AT1G68020,ATTPS6", "AT1G69940,PPME1", "AT1G70290,TPS8"), class = "factor")), .Names = "locus.description", row.names = c(NA,
6L), class = "data.frame")
我想过滤我的第一个 data.frame(tbl_all1),只保留那些能在这20个数据框中找到的基因。当然,我还想保留 tbl_all1 中其他列及其数值。
这就是我自己做的事情:
## LOAD files
library(data.table) # provides rbindlist()

## Main table: gene identifiers in the first column, measurements in the rest
tbl_all1 <- read.csv("1st gel - 24 fractions1.csv", sep = ",")

## Other folder now
## NOTE(review): list.files() scans the current working directory; this
## assumes the working directory has already been switched to the folder
## holding the reference CSV files -- confirm.
## `pattern` is a regular expression, not a shell glob, so anchor on the
## literal ".csv" extension instead of using "*.csv".
list_of_data <- list.files(pattern = "\\.csv$")
tbl_met <- lapply(list_of_data, read.csv)
tbl <- rbindlist(tbl_met) ## binding all of the tables in the list by row
## NOTE(review): `$` partially matches column names here, so `locus` may be
## resolving to a longer name such as `locus.description` -- verify.
vec_names <- tbl$locus ## name of the column with names which I am interested in
vec <- unique(vec_names) ## removing the duplicates

## Selecting the metabolism genes in data sets
## Work on a copy so the table loaded above stays untouched; the original
## script referred to `tbl_all` without ever defining it.
tbl_all <- tbl_all1
## Keep the first nine characters of each identifier, which drops the
## trailing ".N" suffix (e.g. "at1g01090.1" -> "at1g01090").
names_all <- substr(tbl_all[, 1], 1L, 9L)
tbl_all[, 1] <- names_all

## Changing lowercase letters for uppercase
## Only character columns are converted; every other column passes through.
tbl_ready <- data.frame(lapply(tbl_all, function(v) {
  if (is.character(v)) {
    toupper(v)
  } else {
    v
  }
}))

## Keep only the rows whose gene identifier appears in the reference tables
data <- tbl_ready[tbl_ready$Gene.name %in% vec, ]
这段代码运行得很好,但问题是我想知道过滤后的每个基因来自哪个数据框。也就是说,我想在输出中添加一个新列,说明每个基因来自哪个数据表。
这是我加载的文件列表:
> list_of_data
[1] "KEGG AminoAcid Metabolism.csv"
[2] "KEGG basal TFs.csv"
[3] "KEGG Carotinoid Biosynthesis.csv"
[4] "KEGG Chlorophyll Biosynthesis.csv"
[5] "KEGG DNA replication.csv"
[6] "KEGG Glucosinolate Metabolism.csv"
[7] "KEGG Light Harvesting.csv"
[8] "KEGG Photosynthesis.csv"
[9] "KEGG Plastoquinone Biosynthesis.csv"
[10] "KEGG Proteasome.csv"
[11] "KEGG regul autophagy.csv"
[12] "KEGG RNA Polymerase.csv"
[13] "KEGG sulfate metabolism.csv"
[14] "KEGG Terpenoid Backbone_Isoprenoids.csv"
[15] "KEGG thiamine metabolism.csv"
[16] "KEGG Tryptophan BS.csv"
[17] "KEGG_starch.csv"
[18] "KEGG_TCA cycle.csv"
[19] "RNA degradation.csv"
[20] "tRNA synthesis.csv"
希望有足够的信息帮助我!
我的合并数据:
> dput(head(tbl_data))
structure(list(X = c(3L, 19L, 28L, 33L, 34L, 35L), Gene.name = structure(1:6, .Label = c("AT1G01090",
"AT1G02500", "AT1G03130", "AT1G03330", "AT1G03475", "AT1G03630",
"AT1G04410", "AT1G04810", "AT1G06410", "AT1G06570", "AT1G06680",
"AT1G07370", "AT1G07780", "AT1G08490"), class = "factor"), X1_1 = c(0,
0, 0, 0, 0, 0), X1_2 = c(0, 0, 0, 1.133759575, 0, 0), X1_3 = c(0,
1.389359906, 0, 0, 1, 0), X1_4 = c(0, 0.628125036, 0, 1.161302798,
1, 0), X1_5 = c(0, 1.231056083, 0, 0.50892594, 0.01594858, 0),
X1_6 = c(0.186450525, 1, 0, 0.514811996, 0, 0), X1_7 = c(0,
1.149134552, 0, 0, 0, 0), X1_8 = c(0.105799533, 0.386877662,
0, 0, 0, 0), X1_9 = c(0.763452684, 0.554554123, 0, 0, 0,
0), X1_10 = c(0.979400044, 0, 0, 0, 0, 0), X1_11 = c(1.120768885,
0.274641072, 0, 0, 0, 0.690696043), X1_12 = c(1, 0, 0, 0,
0, 0), X1_13 = c(1.276209448, 0, 0, 0, 0, 0), X1_14 = c(0.970143925,
0, 1, 0, 0, 0), X1_15 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_16 = c(0.85529218,
0, 0, 0, 0, 1), X1_17 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_18 = c(1.018244397,
0, 0, 0, 0, 0), X1_19 = c(2.138711024, 0, 0, 0, 0, 0), X1_20 = c(0L,
0L, 0L, 0L, 0L, 0L), X1_21 = c(1.554696031, 0, 0, 0, 0, 0
), X1_22 = c(0L, 0L, 0L, 0L, 0L, 0L), X1_23 = c(0L, 0L, 0L,
0L, 0L, 0L), X1_24 = c(1.681551744, 0, 1, 0, 0, 1.309303957
)), .Names = c("X", "Gene.name", "X1_1", "X1_2", "X1_3",
"X1_4", "X1_5", "X1_6", "X1_7", "X1_8", "X1_9", "X1_10", "X1_11",
"X1_12", "X1_13", "X1_14", "X1_15", "X1_16", "X1_17", "X1_18",
"X1_19", "X1_20", "X1_21", "X1_22", "X1_23", "X1_24"), row.names = c(NA,
6L), class = "data.frame")