合并3个只有部分名称相同的数据框

时间:2014-05-19 09:58:56

标签: r

我有3个数据框,我想绑定在一起。其中两个有25列,一个有24个。在第一列中,您可以找到基因的名称,其余(24列)是值。 大多数情况下,第一列中的名称相同,但顺序可能不同。这意味着一个基因可以在第一个表中排在第二行,在另一个排在第2000行。

有没有办法绑定这些表并保留所有名称?

数据框nr 1.

> dput(head(tbl_gel1))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220", 
"at1g01320", "at1g01420"), x1.1 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.10 = c(NA, NA, 0.97940004406824, 
NA, NA, NA), x1.11 = c(NA, 0.715595925164684, 1.12076888461521, 
NA, 1, NA), x1.12 = c(NA, NA, 1, NA, 1, NA), x1.13 = c(NA, NA, 
1.27620944815459, NA, 1.10617482362388, NA), x1.14 = c(NA, NA, 
0.970143924518673, NA, 0.897284652612375, NA), x1.15 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.16 = c(NA, 
NA, 0.855292180180481, NA, 0.678275003166569, NA), x1.17 = c(NA, 
NA, NA, NA, 1.31361646343431, NA), x1.18 = c(NA, NA, 1.01824439729952, 
NA, 0.731395183389585, NA), x1.19 = c(NA, NA, 2.13871102449867, 
NA, 1.26860481661042, NA), x1.2 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.20 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.21 = c(NA, NA, 1.5546960313129, 
NA, 2.12826383499469, NA), x1.22 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.23 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.24 = c(NA, 0.553801084127354, 
1.68155174378018, NA, 1, NA), x1.3 = c(NA, 1.91253217984776, 
NA, NA, NA, NA), x1.4 = c(NA, 1.2635979388975, NA, NA, NA, NA
), x1.5 = c(NA, 0.997262468935362, NA, NA, NA, 1), x1.6 = c(NA, 
0.836333481838468, 0.186450525168714, NA, NA, 1), x1.7 = c(0.713761294385108, 
0.998433283631924, NA, NA, NA, NA), x1.8 = c(NA, 1.00273753106464, 
0.105799532964898, NA, NA, NA), x1.9 = c(1.14311935280745, 0.720766625421293, 
0.763452683687036, 1, NA, NA)), .Names = c("X", "x1.1", "x1.10", 
"x1.11", "x1.12", "x1.13", "x1.14", "x1.15", "x1.16", "x1.17", 
"x1.18", "x1.19", "x1.2", "x1.20", "x1.21", "x1.22", "x1.23", 
"x1.24", "x1.3", "x1.4", "x1.5", "x1.6", "x1.7", "x1.8", "x1.9"
), row.names = c(NA, 6L), class = "data.frame")

数据框nr 2.

> dput(head(tbl_gel2))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220", 
"at1g01320", "at1g01710"), x1.25 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.26 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.27 = c(NA, 1.27850867927992, 
NA, NA, NA, NA), x1.28 = c(NA, 2.21463917401264, NA, NA, NA, 
NA), x1.29 = c(NA, 0.953489798239202, NA, NA, NA, NA), x1.30 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.31 = c(0.601097045390844, 
1.19887070873732, NA, NA, NA, NA), x1.32 = c(1.09252174468455, 
0.517510161235958, 0.615226728434775, 0.542879255811352, NA, 
NA), x1.33 = c(1, 0.419630322520778, 1, 1, NA, NA), x1.34 = c(NA, 
NA, 0.799097170720151, NA, NA, NA), x1.35 = c(1, 0.585395967425093, 
1.0988581226183, 1, 0.850838480887727, 1), x1.36 = c(0.267978732865144, 
NA, 0.815842027204421, NA, NA, NA), x1.37 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.38 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.39 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.40 = c(NA, 
0.253595454669352, 1, NA, 1, NA), x1.41 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.42 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.43 = c(NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.44 = c(NA, 
0.801129291262679, 1.39524715613774, NA, 1, 1.34188880143232), 
    x1.45 = c(NA, NA, NA, NA, 3.10996537363054, NA), x1.46 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.48 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), .Names = c("X", 
"x1.25", "x1.26", "x1.27", "x1.28", "x1.29", "x1.30", "x1.31", 
"x1.32", "x1.33", "x1.34", "x1.35", "x1.36", "x1.37", "x1.38", 
"x1.39", "x1.40", "x1.41", "x1.42", "x1.43", "x1.44", "x1.45", 
"x1.46", "x1.48"), row.names = c(NA, 6L), class = "data.frame")

数据框nr 3.

> dput(head(tbl_gel3))
structure(list(X = c("at1g01050", "at1g01080", "at1g01090", "at1g01220", 
"at1g01320", "at1g01420"), x1.49 = c(NA_real_, NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_), x1.50 = c(NA_real_, NA_real_, 
NA_real_, NA_real_, NA_real_, NA_real_), x1.51 = c(NA, 1, NA, 
NA, NA, NA), x1.52 = c(NA, 1.7994810956534, NA, NA, NA, NA), 
    x1.53 = c(NA, 1, NA, NA, NA, 1), x1.54 = c(NA, 7.89402612997038, 
    NA, NA, NA, NA), x1.55 = c(0.920776942793063, 0.996320522101043, 
    0.254584439603907, NA, NA, NA), x1.56 = c(1, 0.729758385900956, 
    0.300151773873743, NA, NA, NA), x1.57 = c(1, 0.947723222879912, 
    0.948619033067299, 1, NA, NA), x1.58 = c(1, 0.928854762925871, 
    1.3235617264432, 0.785944656498542, 0.675641973487141, NA
    ), x1.59 = c(1.06908415906789, 0.634382162824105, 1.04395304578544, 
    1, 0.650651881343625, NA), x1.60 = c(1.80853320689787, NA, 
    0.880820179658551, NA, NA, NA), x1.61 = c(1, NA, 1.6718152409295, 
    1.09278053029295, 1.01060798973004, NA), x1.62 = c(0.704459686809266, 
    NA, 1, NA, 1.08123985492291, NA), x1.63 = c(0.629128718440608, 
    0.445252633504756, 0.675960340502994, NA, 1, NA), x1.64 = c(0.171185393355124, 
    0.884594994748168, 1, NA, 1.08954220349952, NA), x1.65 = c(NA, 
    NA, 1.11460636151774, NA, NA, NA), x1.66 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), x1.67 = c(NA, NA, 
    NA, NA, 10.2238567979379, NA), x1.68 = c(NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_), x1.69 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.70 = c(NA, 
    NA, 2.0577136925345, NA, 3.60392205648014, NA), x1.71 = c(NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), x1.72 = c(NA, 
    NA, 10.9845898205719, NA, NA, NA)), .Names = c("X", "x1.49", 
"x1.50", "x1.51", "x1.52", "x1.53", "x1.54", "x1.55", "x1.56", 
"x1.57", "x1.58", "x1.59", "x1.60", "x1.61", "x1.62", "x1.63", 
"x1.64", "x1.65", "x1.66", "x1.67", "x1.68", "x1.69", "x1.70", 
"x1.71", "x1.72"), row.names = c(NA, 6L), class = "data.frame")

作为输出,我希望得到一个包含73列的数据框(一列是基因名称,其余72列是值)。例如,如果某个基因只出现在其中一个数据框中,我希望在其他数据框对应的列中填入NA。我不想丢失任何基因。

我试图通过循环绑定它们但我一直都失败了。所以我决定在这里问:

## Getting the list of all accessions
# Read every element of `tbl` with read.csv (presumably `tbl` is a vector of
# CSV file paths -- confirm), then stack all tables row-wise.
# `rbind.fill` is not base R; presumably plyr::rbind.fill -- confirm that
# plyr is attached before this runs.
list_of_data = lapply(tbl, read.csv)
all_data = do.call(rbind.fill, list_of_data)


## Getting the proper names
# Distinct values of the first column of the stacked table; these become the
# row names of the final result.
vec_names <- all_data[,1]
vec_names <- unique(vec_names)

#### Loop to bind all of the data

# For each table keep rows 3..nrow and columns 2..25 (i.e. drop the first two
# rows and the name column).
# NOTE(review): a table with fewer than 25 columns cannot be indexed with 2:25
# and will likely fail here with "undefined columns selected" -- confirm.
gdata = lapply(list_of_data,function(x) x[3:nrow(x),2:25])

# Label the rows of each value table with the names from column 1 of the same
# rows of the corresponding source table.
for( i in 1:length(list_of_data)){
  rownames(gdata[[i]]) = list_of_data[[i]][3:nrow(list_of_data[[i]]),1]
}
# NOTE(review): `ncol=24` is passed to lapply(), which forwards it to the
# anonymous function; that function only accepts `x`, so this call looks like
# it will stop with an "unused argument" error. The result `tmp` is also
# overwritten inside the loop below and never read.
tmp = lapply(gdata,function(x) matrix(x),ncol=24)




# Build the result one name at a time: look the name up in every value table,
# stack the matching rows, flatten them into one vector, and append that
# vector as a new row of final.table1.
final.table1=c()
for(i in 1:length(vec_names)){
  print(i)
  # Current name; f1 below reads `tmp` from this enclosing scope.
  # NOTE(review): if the name column was read as a factor, indexing with it
  # may use integer codes instead of row names -- confirm.
  tmp=vec_names[i]
  f1 = function(x) {x[tmp,]}
  tmp2 = lapply(gdata,f1)
  tmp3 = c()
  # NOTE(review): the per-table rows carry different column names, so rbind on
  # data frames may fail with "names do not match previous names" -- confirm.
  for(j in 1:length(tmp2)){
    tmp3=rbind(tmp3,tmp2[[j]])
  }
  # Transpose, then flatten: values come out table by table, each in column
  # order.
  tmp4 = as.vector(t(tmp3))
  final.table1 = rbind(final.table1,tmp4)
}

# One row per distinct name, labelled with that name.
rownames(final.table1) = vec_names

我不是这个循环的忠实粉丝,所以欢迎将它们组合在一起的任何其他方式。我仍然会尝试玩这个循环......

1 个答案:

答案 0 :(得分:1)

merge可以解决问题:

# Full outer join on the gene-name column "X": tables 1 and 2 are joined
# first, then table 3 is joined onto that result. `all = TRUE` keeps rows
# whose key is missing from either side and fills the gaps with NA.
merge(
  x = merge(x = tbl_gel1, y = tbl_gel2, by = "X", all = TRUE),
  y = tbl_gel3,
  by = "X",
  all = TRUE
)

如果您有大量数据框,则可以使用Reduce一次性合并所有数据框:

# Fold a full outer join on "X" over the list, left to right: the running
# result is merged with the next data frame; `all = TRUE` keeps every key.
Reduce(
  f = function(left, right) merge(left, right, by = "X", all = TRUE),
  x = list(tbl_gel1, tbl_gel2, tbl_gel3)
)

如果merge无效,您可以查看plyr包的join函数。