我正在尝试在R中合并两个data.frames
# Example inputs. d1 holds Height and Grade, d2 holds Sex and City; both
# share Id, Name and Age (the Age values differ between the two frames).
d1 <- data.frame(
  Id = 1:3,
  Name = c("Yann", "Anne", "Sabri"),
  Age = c(21, 19, 31),
  Height = c(178, 169, 192),
  Grade = c(15, 12, 18)
)
d2 <- data.frame(
  Id = c(1, 3, 4),
  Name = c("Yann", "Sabri", "Jui"),
  Age = c(28, 21, 15),
  Sex = c("M", "M", "F"),
  City = c("Paris", "Paris", "Toulouse")
)
我想按 Id 合并,并且在最终的 data.frame 中只保留 Id、Name、Age、Sex 和 Grade 这几列。
我想出了一个很长的代码来完成这项工作,但是还有更好的方法吗?
# Hand-rolled merge: one output row per Id that appears in d1 or d2.
dm <- data.frame(Id = unique(c(d1$Id, d2$Id)))
# Position of every merged Id within d1 / d2; NA marks an Id that is absent
# from that input. match() is already vectorised, so no sapply() is needed.
dm.d1.rows <- match(dm$Id, d1$Id)
dm.d2.rows <- match(dm$Id, d2$Id)
for (i in c("Name", "Age", "Sex", "Grade")) {
  in.d1 <- i %in% colnames(d1)
  in.d2 <- i %in% colnames(d2)
  if ((in.d1 && is.factor(d1[[i]])) || (in.d2 && is.factor(d2[[i]]))) {
    # Factor column: start from an all-NA factor that carries the levels of
    # both inputs, so values from either side can be assigned into it.
    dm[[i]] <- factor(rep(NA, nrow(dm)),
                      levels = unique(c(levels(d1[[i]]), levels(d2[[i]]))))
  } else {
    dm[[i]] <- rep(NA, nrow(dm))
  }
  # Fill from d1 first and d2 second: where both inputs supply a value for
  # the same Id, the value from d2 overwrites the one from d1.
  if (in.d1) {
    dm[[i]][!is.na(dm.d1.rows)] <- d1[[i]][dm.d1.rows[!is.na(dm.d1.rows)]]
  }
  if (in.d2) {
    dm[[i]][!is.na(dm.d2.rows)] <- d2[[i]][dm.d2.rows[!is.na(dm.d2.rows)]]
  }
}
答案 0 :(得分:5)
这是一种借助 tidyverse 中 coalesce 函数的思路。该函数基本上会把 NA 值替换为另一(指定)列中的值。您可以在此处(here)找到 coalesce 函数的文档。
引自 coalesce 的官方文档:给定一组向量,coalesce() 会找到每个位置上的第一个非缺失值。这是受 SQL 的 COALESCE 函数启发的,该函数对 NULL 执行相同的操作。
library(tidyverse)

# Full join on the columns that identify a person; the clashing Age columns
# come out of the join as Age.x (from d1) and Age.y (from d2).
d1 |>
  full_join(d2, by = c("Id", "Name")) |>
  # Take d1's Age where it exists, otherwise fall back to d2's.
  mutate(Age = coalesce(Age.x, Age.y)) |>
  # Keep only the requested columns, in the requested order.
  select(Id, Name, Age, Sex, Grade)
给出,
  Id  Name Age  Sex Grade
1  1  Yann  21    M    15
2  2  Anne  19 <NA>    12
3  3 Sabri  31    M    18
4  4   Jui  15    F    NA
类似地,以data.table语法,
library(data.table)
# Convert to data.tables. as.data.table() returns converted copies, whereas
# setDT() would turn d1 and d2 themselves into data.tables by reference and
# change how later data-frame code such as d1[cols] behaves.
d1_t <- as.data.table(d1)
d2_t <- as.data.table(d2)
# Full outer join on Id and Name; the clashing Age columns become Age.x
# (from d1) and Age.y (from d2).
merge(d1_t, d2_t, by = c("Id", "Name"), all = TRUE)[
  # Prefer d1's Age and fall back to d2's where d1 has none.
  , Age := fcoalesce(Age.x, Age.y)
][
  # Drop the helper columns and the columns that were not requested.
  , c("Age.x", "Age.y", "City", "Height") := NULL
][]
给出,
   Id  Name Grade  Sex Age
1:  1  Yann    15    M  21
2:  2  Anne    12 <NA>  19
3:  3 Sabri    18    M  31
4:  4   Jui    NA    F  15
答案 1 :(得分:2)
我个人是 sqldf 的忠实拥护者,它允许您使用 SQL 查询来创建/操作数据帧。在您的情况下,下面的语句应该可以解决问题。
# sqldf() runs the SQL statement against the data frames d1 and d2 below.
library(sqldf)

d1 <- data.frame(Id = 1:3, Name = c("Yann", "Anne", "Sabri"), Age = c(21, 19, 31),
                 Height = c(178, 169, 192), Grade = c(15, 12, 18))
d2 <- data.frame(Id = c(1, 3, 4), Name = c("Yann", "Sabri", "Jui"), Age = c(28, 21, 15),
                 Sex = c("M", "M", "F"), City = c("Paris", "Paris", "Toulouse"))
# Emulates a full outer join with two LEFT JOINs glued together by UNION:
#   * first SELECT: every d1 row, with Sex looked up in d2;
#   * second SELECT: every d2 row, with Age and Grade looked up in d1
#     (Age falls back to d2's value when d1 has no matching row).
# UNION removes the rows that both halves produce identically, and the
# result takes its column names from the first SELECT.
# coalesce(d1.Grade, NULL) evaluates to the same value as d1.Grade.
d3 <- sqldf("SELECT d1.Id, d1.Name, d1.Age, d2.Sex , d1.Grade
FROM d1
LEFT JOIN d2 ON d1.Id = d2.Id
UNION
SELECT d2.Id, d2.Name, coalesce(d1.Age, d2.Age) , d2.Sex, coalesce(d1.Grade, NULL)
FROM d2
LEFT JOIN d1 ON d2.Id = d1.Id")
特别是对于更复杂的数据帧合并/操作,使用 sqldf / SQL 可能会很有用。
编辑:已在 sqldf / R 环境中修复 SQL 语句,结果如下表:
答案 2 :(得分:2)
在基础 R(base R)中:
# Base R: stack both inputs with merge(), then collapse to one row per Id.
d1 <- data.frame(Id = 1:3, Name = c("Yann", "Anne", "Sabri"), Age = c(21, 19, 31),
                 Height = c(178, 169, 192), Grade = c(15, 12, 18),
                 stringsAsFactors = FALSE)
d2 <- data.frame(Id = c(1, 3, 4), Name = c("Yann", "Sabri", "Jui"), Age = c(28, 21, 15),
                 Sex = c("M", "M", "F"), City = c("Paris", "Paris", "Toulouse"),
                 stringsAsFactors = FALSE)
nms <- c("Id", "Name", "Age", "Sex", "Grade")
# Without `by`, merge() joins on every shared column (Id, Name, Age). Because
# the Ages differ, a person present in both inputs ends up on two rows, each
# with NA in the columns that belong to the other input.
merged <- merge(d2, d1, all = TRUE, sort = FALSE)[nms]
# Per Id and per column keep the first non-missing value; the appended NA
# guarantees a length-one result when every value in the group is missing.
# On this data the rows that came from d2 are seen first, so d2's Age wins
# (see the output below).
res <- aggregate(merged, list(merged$Id),
                 function(x) c(na.omit(x), NA)[1])[-1]
res
#   Id  Name Age  Sex Grade
# 1  1  Yann  28    M    15
# 2  2  Anne  19 <NA>    12
# 3  3 Sabri  21    M    18
# 4  4   Jui  15    F    NA
请注意 stringsAsFactors = FALSE:在应用此解决方案之前,您需要先将因子转换为字符。
答案 3 :(得分:1)
这可能不是理想的答案,但这是一个使用 sapply 的、不依赖合并(merge)或联接(join)的方案,因为我们只需要按一列来合并两个数据帧。
# Name the cols which you want in the final data frame
cols <- c("Id", "Name", "Age", "Sex", "Grade")
# Get all unique id's
ids <- union(d1$Id, d2$Id)
# Build one single-row data frame per Id. simplify = FALSE keeps the result
# a plain list; letting sapply() simplify it would produce a matrix of list
# cells and, after data.frame(t(...)), list columns instead of vectors.
rows <- sapply(ids, function(x) {
  # Get indices in d1 where Id is present
  d1inds <- d1$Id == x
  # Get indices in d2 where Id is present
  d2inds <- d2$Id == x
  if (any(d1inds) && any(d2inds)) {
    # The Id is present in both d1 AND d2: combine d2 and d1 and select only
    # the cols columns. Selecting by name picks the first of the duplicated
    # Id/Name/Age columns, so putting d2 first prefers d2's Name and Age.
    # If you want to prefer d1 over d2, we can do
    # cbind(d1[d1inds, ], d2[d2inds, ])[cols]
    cbind(d2[d2inds, ], d1[d1inds, ])[cols]
  } else if (any(d1inds)) {
    # The Id is present only in d1: add a "Sex" column with NA
    cbind(d1[d1inds, ], "Sex" = NA)[cols]
  } else {
    # The Id is present only in d2: add a "Grade" column with NA
    cbind(d2[d2inds, ], "Grade" = NA)[cols]
  }
}, simplify = FALSE)
# Stack the rows into one data frame with ordinary atomic columns and fresh
# row names 1..n.
do.call(rbind, c(rows, make.row.names = FALSE))
#  Id  Name Age  Sex Grade
#1  1  Yann  28    M    15
#2  2  Anne  19 <NA>    12
#3  3 Sabri  21    M    18
#4  4   Jui  15    F    NA
数据
# Inputs for the snippet above; strings are kept as character vectors.
d1 <- data.frame(
  Id = 1:3,
  Name = c("Yann", "Anne", "Sabri"),
  Age = c(21, 19, 31),
  Height = c(178, 169, 192),
  Grade = c(15, 12, 18),
  stringsAsFactors = FALSE
)
d2 <- data.frame(
  Id = c(1, 3, 4),
  Name = c("Yann", "Sabri", "Jui"),
  Age = c(28, 21, 15),
  Sex = c("M", "M", "F"),
  City = c("Paris", "Paris", "Toulouse"),
  stringsAsFactors = FALSE
)
答案 4 :(得分:0)
您可以使用我的软件包 safejoin 进行完全联接(full join),并使用 dplyr::coalesce 处理冲突。我们还使用了 dplyr::one_of,这样就不必手动选择列。
# devtools::install_github("moodymudskipper/safejoin")
library(safejoin)
keep <- c("Id", "Name", "Age", "Sex", "Grade")
# The dplyr helpers are called with an explicit dplyr:: prefix so the snippet
# also runs when only safejoin has been attached.
# one_of(keep) keeps just those names in `keep` that each frame actually has.
# conflict = dplyr::coalesce resolves the Age column present in both inputs
# by taking d1's value first (see the output below).
# NOTE(review): check = "" presumably switches off safejoin's input checks;
# confirm against the package documentation.
safe_full_join(
  dplyr::select(d1, dplyr::one_of(keep)),
  dplyr::select(d2, dplyr::one_of(keep)),
  by = c("Id", "Name"),
  conflict = dplyr::coalesce,
  check = ""
)
#   Id  Name Age Grade  Sex
# 1  1  Yann  21    15    M
# 2  2  Anne  19    12 <NA>
# 3  3 Sabri  31    18    M
# 4  4   Jui  15    NA    F