我有两个数据框,看起来有点像:
# Example inputs: two 5 x 2 character matrices. Each pairs a name column with
# a value column; the column names come from the argument names given to
# cbind().
Source_name <- paste0("name", 1:5)
Target_name <- paste0("name", 10:14)
values <- c("asd", "213", "kahsd", "a9u", "oau92")
values2 <- c("asdd", "oau892", "kahsd", "213", "213")
dat <- cbind(Source_name = Source_name, values = values)
daf <- cbind(Target_name = Target_name, values2 = values2)
dat
Source_name values
[1,] "name1" "asd"
[2,] "name2" "213"
[3,] "name3" "kahsd"
[4,] "name4" "a9u"
[5,] "name5" "oau92"
daf
Target_name values2
[1,] "name10" "asdd"
[2,] "name11" "oau892"
[3,] "name12" "kahsd"
[4,] "name13" "213"
[5,] "name14" "213"
每个值在dat中只出现一次,但在daf中可能出现多次(或根本不出现)。我想把在daf中最多出现一次的那些值记录下来,得到如下desired_output所示的data.frame。
# Expected result: one row per value of `dat` that occurs at most once in
# `daf`, together with its source name and, where one exists, the matching
# target name.
# NOTE: this reassigns the global `Source_name` and `Target_name` vectors that
# were used to build `dat` and `daf`.
unique_values <- c("asd", "kahsd", "a9u", "oau92")
Source_name <- c("name1", "name3", "name4", "name5")
# Use real missing values rather than the literal string "NA", so that
# is.na() identifies the sources that have no target.
Target_name <- c(NA_character_, "name12", NA_character_, NA_character_)
# Build the data frame directly from the vectors; going through cbind() first
# would create an intermediate character matrix for no benefit.
desired_output <- data.frame(
  unique_values,
  Source_name,
  Target_name,
  stringsAsFactors = FALSE
)
desired_output
unique_values Source_name Target_name
1 asd name1
3 kahsd name3 name12
2 a9u name4
4 oau92 name5
我想应该有一种使用apply或类似函数的简单方法,但我一直没能做出来。
答案 0 :(得分:3)
您可以合并两个data.frames:
# Left join of `dat` with `daf` on the value columns: all.x = TRUE keeps every
# row of `dat`, and a value with no partner in `daf` gets NA in the columns
# that come from `daf`.
dd <- merge(
  x = dat,
  y = daf,
  by.x = "values",
  by.y = "values2",
  all.x = TRUE
)
dd
# values Source_name Target_name
# 1 213 name2 name13
# 2 213 name2 name14
# 3 a9u name4 <NA>
# 4 asd name1 <NA>
# 5 kahsd name3 name12
# 6 oau92 name5 <NA>
然后删除values值出现两次或更多次的行:
# Keep only the rows whose `values` key occurs once in `dd`: group the row
# indices by key, discard every group holding two or more indices, and index
# `dd` with what is left.
dd[
  unlist(
    Filter(
      function(row_ids) length(row_ids) <= 1L,
      split(seq_len(nrow(dd)), dd$values)
    )
  ),
]
# values Source_name Target_name
# 3 a9u name4 <NA>
# 4 asd name1 <NA>
# 5 kahsd name3 name12
# 6 oau92 name5 <NA>
或者,正如@hadley在评论中指出的那样(谢谢!),可以写得更简洁:
# Keep only the rows whose `values` key occurs once in `dd`.
# ave() returns a vector of the same type as its first argument, so the counts
# are computed over the integer row indices. Counting over the character
# column `dd$values` would turn the counts into strings, and a comparison such
# as "10" < 2 is then done on strings and is TRUE, which would wrongly keep a
# key that occurs ten or more times.
dd[ave(seq_along(dd$values), dd$values, FUN = length) < 2, ]
答案 1 :(得分:2)
不是最优雅的解决方案:
# Convert both inputs to data frames; stringsAsFactors = FALSE keeps the
# columns as plain character vectors instead of factors.
dat <- as.data.frame(x = dat, stringsAsFactors = FALSE)
daf <- as.data.frame(x = daf, stringsAsFactors = FALSE)
# Build the output row for one source name.
#
# Arguments:
#   x    A single source name, looked up in the first column of `src`.
#   src  Data frame with source names in column 1 and values in column 2.
#        Defaults to the global `dat`.
#   tgt  Data frame with target names in column 1 and values in column 2.
#        Defaults to the global `daf`.
#
# Returns a data frame with the columns unique_values, Source_name and
# Target_name:
#   - one row with Target_name "" when the value does not occur in `tgt`,
#   - one row with the matching target name when it occurs exactly once,
#   - zero rows when it occurs more than once, or when `x` is not in `src`.
fun <- function(x, src = dat, tgt = daf) {
  # Look the name up in the table itself. A free-standing `Source_name`
  # vector may have been reassigned since `src` was built and then no longer
  # lines up with its rows.
  src_row <- src[src[, 1] == x, , drop = FALSE]
  if (nrow(src_row) > 1) {
    stop("`x` matches more than one row of `src`", call. = FALSE)
  }

  value <- as.character(src_row[, 2])
  # Count the matches on `tgt` directly, so `src` and `tgt` may have different
  # numbers of rows.
  matches <- as.character(tgt[tgt[, 2] %in% value, 1])

  if (nrow(src_row) == 0 || length(matches) > 1) {
    # Unknown name or ambiguous value: contribute no row to the result.
    return(data.frame(
      unique_values = character(0),
      Source_name = character(0),
      Target_name = character(0),
      stringsAsFactors = FALSE
    ))
  }

  data.frame(
    unique_values = value,
    Source_name = as.character(src_row[, 1]),
    Target_name = if (length(matches) == 1) matches else "",
    stringsAsFactors = FALSE
  )
}
# Apply `fun` to every source name in the first column of `dat` and stack the
# per-name results into one data frame.
do.call(what = rbind, args = lapply(X = dat[, 1], FUN = fun))
unique_values Source_name Target_name
1 asd name1
2 kahsd name3 name12
3 a9u name4
4 oau92 name5