如何统一名称并合并这两个数据框?

时间:2018-09-24 18:47:57

标签: r

我正在尝试基于文件名(filename)和登录号(accession)来合并这两个 data.frame,以便得到一个包含 3 列的数据框:accession、d40 和 specie。由于文件名中带有 `_` 和 `.tab`,文件名与登录号的写法略有不同。我想找到一种方法把名称统一起来,然后合并这两个表。有人可以帮我吗?谢谢!!

> head(me)
         filename      d40
1 Ames_13502_.tab 15714.88
2 Ames_16950_.tab 21633.59
3  Ames_1903_.tab 17167.95
4 Ames_19097_.tab 17041.48
5 Ames_19467_.tab 17198.14
6 Ames_19532_.tab 17307.25
> head(TM)
    specie   accesion
1 Teosinte Ames 13502
2 Teosinte Ames 21785
3 Teosinte Ames 21786
4 Teosinte Ames 21787
5 Teosinte Ames 21789
6 Teosinte Ames 21790

2 个答案:

答案 0 :(得分:1)

尝试gsub

# Rewrite file names such as "Ames_13502_.tab" as "Ames 13502": the two
# capture groups (prefix and digits) are kept and joined by a single space.
me$filename <- gsub("(Ames)_(\\d+)_\\.tab", "\\1 \\2", me$filename)
# Auto-print the cleaned column to check the result.
me$filename
#[1] "Ames 13502" "Ames 16950" "Ames 1903"  "Ames 19097" "Ames 19467" "Ames 19532"

现在通过 `accesion` 和 `filename` 这两列合并两个数据框。

# Full outer join: TM is matched on "accesion", me on "filename"; rows
# without a partner on either side are kept and padded with NA.
merge(
  x = TM,
  y = me,
  by.x = "accesion",
  by.y = "filename",
  all.x = TRUE,
  all.y = TRUE
)
#     accesion   specie      d40
#1  Ames 13502 Teosinte 15714.88
#2  Ames 16950     <NA> 21633.59
#3   Ames 1903     <NA> 17167.95
#4  Ames 19097     <NA> 17041.48
#5  Ames 19467     <NA> 17198.14
#6  Ames 19532     <NA> 17307.25
#7  Ames 21785 Teosinte       NA
#8  Ames 21786 Teosinte       NA
#9  Ames 21787 Teosinte       NA
#10 Ames 21789 Teosinte       NA
#11 Ames 21790 Teosinte       NA

数据

# Example input matching head(me) in the question: the raw file names (still
# carrying the "_" separators and the ".tab" suffix) with their d40 values.
# The names are deliberately left uncleaned so that the gsub() step has
# something to transform when this example is run from scratch.
me <- structure(
  list(
    filename = c(
      "Ames_13502_.tab", "Ames_16950_.tab", "Ames_1903_.tab",
      "Ames_19097_.tab", "Ames_19467_.tab", "Ames_19532_.tab"
    ),
    d40 = c(15714.88, 21633.59, 17167.95, 17041.48, 17198.14, 17307.25)
  ),
  row.names = c("1", "2", "3", "4", "5", "6"),
  class = "data.frame"
)

# Example lookup table: one species label per accession identifier.
# Note the column really is spelled "accesion" (single "s").
TM <- structure(
  list(
    specie = rep("Teosinte", times = 6L),
    accesion = c(
      "Ames 13502", "Ames 21785", "Ames 21786",
      "Ames 21787", "Ames 21789", "Ames 21790"
    )
  ),
  row.names = c("1", "2", "3", "4", "5", "6"),
  class = "data.frame"
)

答案 1 :(得分:1)

使用dplyr

library(dplyr)

# Clean the file names in two steps, then full-join on the accession column:
#   1. replace "_.tab" and every remaining "_" with a space
#   2. trim the trailing space left behind by the removed suffix
df1 %>%
  mutate(
    filename = gsub("_\\.tab|_", " ", filename),
    filename = trimws(filename)
  ) %>%
  full_join(df2, by = c(filename = "accesion"))

输出:

     filename      d40   specie
1  Ames 13502 15714.88 Teosinte
2  Ames 16950 21633.59     <NA>
3   Ames 1903 17167.95     <NA>
4  Ames 19097 17041.48     <NA>
5  Ames 19467 17198.14     <NA>
6  Ames 19532 17307.25     <NA>
7  Ames 21785       NA Teosinte
8  Ames 21786       NA Teosinte
9  Ames 21787       NA Teosinte
10 Ames 21789       NA Teosinte
11 Ames 21790       NA Teosinte

数据:

# Example input: raw file names stored as a factor (levels in the order
# listed below) together with their d40 values.
df1 <- structure(
  list(
    filename = factor(
      1:6,
      labels = c(
        "Ames_13502_.tab", "Ames_16950_.tab", "Ames_1903_.tab",
        "Ames_19097_.tab", "Ames_19467_.tab", "Ames_19532_.tab"
      )
    ),
    d40 = c(15714.88, 21633.59, 17167.95, 17041.48, 17198.14, 17307.25)
  ),
  row.names = c("1", "2", "3", "4", "5", "6"),
  class = "data.frame"
)

# Example lookup table: one species label per accession identifier.
# Note the column really is spelled "accesion" (single "s").
df2 <- structure(
  list(
    specie = rep("Teosinte", times = 6L),
    accesion = c(
      "Ames 13502", "Ames 21785", "Ames 21786",
      "Ames 21787", "Ames 21789", "Ames 21790"
    )
  ),
  row.names = c("1", "2", "3", "4", "5", "6"),
  class = "data.frame"
)