我有两个 data.frame。
第一个:
> dput(head(tbl_mz))
structure(list(m.z = c(258.1686969, 258.168752, 587.8313625,
587.8425292, 523.2863282, 523.2859396), Measured.mass = c(514.3228408,
514.3229511, 1173.648172, 802.4706732, 1272.645144, 1044.557326
)), .Names = c("m.z", "Measured.mass"), row.names = c(NA, 6L), class = "data.frame")
第二个:
> dput(head(tbl_exl))
structure(list(V1 = c(802.4706732, 1272.649209, 1272.646875,
1272.646599, 1272.646521, 1272.645144), V2 = c(NA, NA, NA, NA,
NA, NA), V3 = c(NA, NA, NA, NA, NA, NA), V4 = c(NA, NA, NA, NA,
NA, NA), V5 = c(NA, NA, NA, NA, NA, NA), V6 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "Positive"), class = "factor"),
V7 = c(28.7, 29.4, 29.4, 23.8, 28.6, 23.3), V8 = c(30.7,
31.4, 31.4, 25.8, 30.6, 25.3), X = c(NA, NA, NA, NA, NA,
NA), X.1 = c(NA, NA, NA, NA, NA, NA), X.2 = c(NA, NA, NA,
NA, NA, NA)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6",
"V7", "V8", "X", "X.1", "X.2"), row.names = c(NA, 6L), class = "data.frame")
我想将 tbl_exl 的 V1 列中的某些值替换为另一个表 tbl_mz 中的值。V1 列(tbl_exl)中的值可以在 Measured.mass 列(tbl_mz)中找到,它们应替换为数据框 tbl_mz 中相邻的 m.z 列里对应的值。
换句话说,V1 中的值应替换为对应的 m.z 值。
问题是,并非 V1 中的所有值都能在另一个数据框中找到。那些找不到的值可以删除,也可以保持原样。
我想得到的输出是:
> dput(head(tbl_exl_modified))
structure(list(V1 = c(587.8425292, 1272.649209, 1272.646875,
1272.646599, 1272.646521, 523.2863282), V2 = c(NA, NA, NA, NA,
NA, NA), V3 = c(NA, NA, NA, NA, NA, NA), V4 = c(NA, NA, NA, NA,
NA, NA), V5 = c(NA, NA, NA, NA, NA, NA), V6 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "Positive"), class = "factor"),
V7 = c(28.7, 29.4, 29.4, 23.8, 28.6, 23.3), V8 = c(30.7,
31.4, 31.4, 25.8, 30.6, 25.3), X = c(NA, NA, NA, NA, NA,
NA), X.1 = c(NA, NA, NA, NA, NA, NA), X.2 = c(NA, NA, NA,
NA, NA, NA)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6",
"V7", "V8", "X", "X.1", "X.2"), row.names = c(NA, 6L), class = "data.frame")
答案 0 :(得分:3)
您可以尝试 match。根据两个数据集的列("Measured.mass"、"V1")之间的 match 结果创建数字索引,去掉其中的 NA 值(得到 "indx1"、"indxN1"),然后根据这些索引将 "V1" 的值替换为 "m.z" 的值。
# Replace V1 values in tbl_exl with the m.z value of the tbl_mz row whose
# Measured.mass equals V1; V1 values without a match are left unchanged.
#
# Both index vectors are derived from ONE match() call so they are guaranteed
# to be aligned element by element (row of tbl_exl <-> matched row of tbl_mz),
# regardless of the relative row order of the two tables or of duplicated
# values in V1.
# Note: match() compares doubles exactly, so values must be bit-identical.
indxN <- match(tbl_exl$V1, tbl_mz$Measured.mass)  # tbl_mz row per V1, or NA
indx1 <- which(!is.na(indxN))                     # tbl_exl rows that matched
indxN1 <- indxN[indx1]                            # the tbl_mz rows they matched
tbl_exl$V1[indx1] <- tbl_mz$m.z[indxN1]
identical(tbl_exl, tbl_exl_modified)
#[1] TRUE
或者使用 dplyr 的 left_join:
library(dplyr)
# left_join() attaches the matching m.z to every row of tbl_exl (NA where V1
# has no matching Measured.mass). coalesce() then takes m.z when it is present
# and falls back to the original V1 otherwise; the helper column is dropped.
# Note: if tbl_mz contains duplicated Measured.mass values, left_join() will
# duplicate the corresponding tbl_exl rows.
tbl_exl1 <- left_join(tbl_exl, tbl_mz, by = c("V1" = "Measured.mass")) %>%
  mutate(V1 = coalesce(m.z, V1)) %>%
  select(-m.z)
tbl_exl1
# V1 V2 V3 V4 V5 V6 V7 V8 X X.1 X.2
#1 587.8425 NA NA NA NA Positive 28.7 30.7 NA NA NA
#2 1272.6492 NA NA NA NA Positive 29.4 31.4 NA NA NA
#3 1272.6469 NA NA NA NA Positive 29.4 31.4 NA NA NA
#4 1272.6466 NA NA NA NA Positive 23.8 25.8 NA NA NA
#5 1272.6465 NA NA NA NA Positive 28.6 30.6 NA NA NA
#6 523.2863 NA NA NA NA Positive 23.3 25.3 NA NA NA
答案 1 :(得分:3)
这是使用 data.table 二分连接(binary join)的方法:
library(data.table)
# Convert both tables to data.table by reference (no copy is made).
setDT(tbl_exl)
setDT(tbl_mz)
# Update join: for every row of tbl_exl whose V1 equals a Measured.mass in
# tbl_mz, overwrite V1 in place with the matching m.z (`i.m.z` is the m.z
# column of the table passed as `i`). Unmatched rows are left untouched.
# Joining with `on=` instead of setnames()/setkey() keeps the column name V1
# and the original row order of tbl_exl, as required by the desired output.
tbl_exl[tbl_mz, V1 := i.m.z, on = c(V1 = "Measured.mass")][]
# V1 V2 V3 V4 V5 V6 V7 V8 X X.1 X.2
# 1: 587.8425 NA NA NA NA Positive 28.7 30.7 NA NA NA
# 2: 1272.6492 NA NA NA NA Positive 29.4 31.4 NA NA NA
# 3: 1272.6469 NA NA NA NA Positive 29.4 31.4 NA NA NA
# 4: 1272.6466 NA NA NA NA Positive 23.8 25.8 NA NA NA
# 5: 1272.6465 NA NA NA NA Positive 28.6 30.6 NA NA NA
# 6: 523.2863 NA NA NA NA Positive 23.3 25.3 NA NA NA