我有3个数据框,想把它们合并在一起。其中两个有25列,一个有24列。第一列是基因的名称,其余各列是数值。大多数情况下,各表第一列中的名称相同,但顺序可能不同。也就是说,同一个基因在第一个表中可能排在第2行,而在另一个表中排在第2000行。
有没有办法绑定这些表并保留所有名称?
数据框nr 1.
> dput(head(tbl_gel1))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220",
"at1g01320", "at1g01420"), x1.1 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), x1.10 = c(NA, NA, 0.97940004406824,
NA, NA, NA), x1.11 = c(NA, 0.715595925164684, 1.12076888461521,
NA, 1, NA), x1.12 = c(NA, NA, 1, NA, 1, NA), x1.13 = c(NA, NA,
1.27620944815459, NA, 1.10617482362388, NA), x1.14 = c(NA, NA,
0.970143924518673, NA, 0.897284652612375, NA), x1.15 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.16 = c(NA,
NA, 0.855292180180481, NA, 0.678275003166569, NA), x1.17 = c(NA,
NA, NA, NA, 1.31361646343431, NA), x1.18 = c(NA, NA, 1.01824439729952,
NA, 0.731395183389585, NA), x1.19 = c(NA, NA, 2.13871102449867,
NA, 1.26860481661042, NA), x1.2 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), x1.20 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.21 = c(NA, NA, 1.5546960313129,
NA, 2.12826383499469, NA), x1.22 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), x1.23 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.24 = c(NA, 0.553801084127354,
1.68155174378018, NA, 1, NA), x1.3 = c(NA, 1.91253217984776,
NA, NA, NA, NA), x1.4 = c(NA, 1.2635979388975, NA, NA, NA, NA
), x1.5 = c(NA, 0.997262468935362, NA, NA, NA, 1), x1.6 = c(NA,
0.836333481838468, 0.186450525168714, NA, NA, 1), x1.7 = c(0.713761294385108,
0.998433283631924, NA, NA, NA, NA), x1.8 = c(NA, 1.00273753106464,
0.105799532964898, NA, NA, NA), x1.9 = c(1.14311935280745, 0.720766625421293,
0.763452683687036, 1, NA, NA)), .Names = c("X", "x1.1", "x1.10",
"x1.11", "x1.12", "x1.13", "x1.14", "x1.15", "x1.16", "x1.17",
"x1.18", "x1.19", "x1.2", "x1.20", "x1.21", "x1.22", "x1.23",
"x1.24", "x1.3", "x1.4", "x1.5", "x1.6", "x1.7", "x1.8", "x1.9"
), row.names = c(NA, 6L), class = "data.frame")
数据框nr 2.
> dput(head(tbl_gel2))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220",
"at1g01320", "at1g01710"), x1.25 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), x1.26 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.27 = c(NA, 1.27850867927992,
NA, NA, NA, NA), x1.28 = c(NA, 2.21463917401264, NA, NA, NA,
NA), x1.29 = c(NA, 0.953489798239202, NA, NA, NA, NA), x1.30 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.31 = c(0.601097045390844,
1.19887070873732, NA, NA, NA, NA), x1.32 = c(1.09252174468455,
0.517510161235958, 0.615226728434775, 0.542879255811352, NA,
NA), x1.33 = c(1, 0.419630322520778, 1, 1, NA, NA), x1.34 = c(NA,
NA, 0.799097170720151, NA, NA, NA), x1.35 = c(1, 0.585395967425093,
1.0988581226183, 1, 0.850838480887727, 1), x1.36 = c(0.267978732865144,
NA, 0.815842027204421, NA, NA, NA), x1.37 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.38 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.39 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.40 = c(NA,
0.253595454669352, 1, NA, 1, NA), x1.41 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.42 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.43 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.44 = c(NA,
0.801129291262679, 1.39524715613774, NA, 1, 1.34188880143232),
x1.45 = c(NA, NA, NA, NA, 3.10996537363054, NA), x1.46 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.48 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("X",
"x1.25", "x1.26", "x1.27", "x1.28", "x1.29", "x1.30", "x1.31",
"x1.32", "x1.33", "x1.34", "x1.35", "x1.36", "x1.37", "x1.38",
"x1.39", "x1.40", "x1.41", "x1.42", "x1.43", "x1.44", "x1.45",
"x1.46", "x1.48"), row.names = c(NA, 6L), class = "data.frame")
数据框nr 3.
> dput(head(tbl_gel3))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220",
"at1g01320", "at1g01420"), x1.49 = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), x1.50 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.51 = c(NA, 1, NA,
NA, NA, NA), x1.52 = c(NA, 1.7994810956534, NA, NA, NA, NA),
x1.53 = c(NA, 1, NA, NA, NA, 1), x1.54 = c(NA, 7.89402612997038,
NA, NA, NA, NA), x1.55 = c(0.920776942793063, 0.996320522101043,
0.254584439603907, NA, NA, NA), x1.56 = c(1, 0.729758385900956,
0.300151773873743, NA, NA, NA), x1.57 = c(1, 0.947723222879912,
0.948619033067299, 1, NA, NA), x1.58 = c(1, 0.928854762925871,
1.3235617264432, 0.785944656498542, 0.675641973487141, NA
), x1.59 = c(1.06908415906789, 0.634382162824105, 1.04395304578544,
1, 0.650651881343625, NA), x1.60 = c(1.80853320689787, NA,
0.880820179658551, NA, NA, NA), x1.61 = c(1, NA, 1.6718152409295,
1.09278053029295, 1.01060798973004, NA), x1.62 = c(0.704459686809266,
NA, 1, NA, 1.08123985492291, NA), x1.63 = c(0.629128718440608,
0.445252633504756, 0.675960340502994, NA, 1, NA), x1.64 = c(0.171185393355124,
0.884594994748168, 1, NA, 1.08954220349952, NA), x1.65 = c(NA,
NA, 1.11460636151774, NA, NA, NA), x1.66 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.67 = c(NA, NA,
NA, NA, 10.2238567979379, NA), x1.68 = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), x1.69 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.70 = c(NA,
NA, 2.0577136925345, NA, 3.60392205648014, NA), x1.71 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.72 = c(NA,
NA, 10.9845898205719, NA, NA, NA)), .Names = c("X", "x1.49",
"x1.50", "x1.51", "x1.52", "x1.53", "x1.54", "x1.55", "x1.56",
"x1.57", "x1.58", "x1.59", "x1.60", "x1.61", "x1.62", "x1.63",
"x1.64", "x1.65", "x1.66", "x1.67", "x1.68", "x1.69", "x1.70",
"x1.71", "x1.72"), row.names = c(NA, 6L), class = "data.frame")
作为输出,我希望得到一个包含73列的数据框(一列是基因名称,其余72列是数值)。例如,如果某个基因只出现在其中一个数据框中,我希望在其他数据框对应的列中填入NA。我不想丢失任何基因。
我试图通过循环绑定它们但我一直都失败了。所以我决定在这里问:
## Getting the list of all accessions
# `tbl` is expected to hold the paths of the CSV files; it is defined outside
# this snippet. rbind.fill() comes from the plyr package.
list_of_data <- lapply(tbl, read.csv)
all_data <- do.call(rbind.fill, list_of_data)
## Getting the proper names
# as.character() matters: if read.csv() returned factors, indexing rows with a
# factor would select by integer code instead of by gene name.
vec_names <- unique(as.character(all_data[, 1]))
#### Bind all of the data
# Keep only the value columns of each file and label the rows with the gene
# names. Every column after the first is taken, so files with a different
# number of value columns (24 vs. 23) are handled alike.
# NOTE(review): the first two rows of every file are skipped, exactly as in the
# original code -- confirm that these rows are not gene rows.
gdata <- lapply(list_of_data, function(x) {
  keep <- seq_len(nrow(x))[-(1:2)]
  vals <- x[keep, -1, drop = FALSE]
  rownames(vals) <- as.character(x[keep, 1])
  vals
})
# Align every table to the full gene list. match() yields NA for genes that are
# absent from a table, and indexing a matrix row with NA produces an all-NA
# row, so no gene is lost.
aligned <- lapply(gdata, function(x) {
  vals <- as.matrix(x)
  vals[match(vec_names, rownames(vals)), , drop = FALSE]
})
# Put the aligned tables side by side: one row per gene, one column per value
# column of every input file (column names are preserved).
final.table1 <- do.call(cbind, aligned)
rownames(final.table1) <- vec_names
我并不喜欢这种循环写法,所以欢迎任何其他能把它们合并在一起的方法。我仍会继续尝试调试这个循环……
答案 0 :(得分:1)
嵌套调用两次 merge 即可解决问题:
# Full outer join on the gene-name column "X": first tbl_gel1 with tbl_gel2,
# then the result with tbl_gel3. all = TRUE keeps genes that are missing from
# either side and fills their value columns with NA.
merge(
  x = merge(x = tbl_gel1, y = tbl_gel2, by = "X", all = TRUE),
  y = tbl_gel3,
  by = "X",
  all = TRUE
)
如果您有大量数据框,则可以使用 Reduce:
# Fold the list of data frames from left to right, outer-joining each one onto
# the accumulated result by the gene-name column "X" (all = TRUE keeps every
# gene and fills the gaps with NA).
Reduce(
  f = function(left, right) {
    merge(x = left, y = right, by = "X", all = TRUE)
  },
  x = list(tbl_gel1, tbl_gel2, tbl_gel3)
)
如果 merge 不适用,您可以看看 plyr 包的 join 函数。