我希望筛选出同时出现在两个表中的动物(交叉条件1),并且这些动物在两个表中具有相同的大小(交叉条件2)。有没有一种高效的写法——例如使用 dplyr?
library(dplyr)
# First example table: one animal per row, tagged with the table it came from.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn the string columns into factors.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
# type size tableName
# 1 cat small animal1
# 2 dog large animal1
# 3 dog small animal1
# 4 bird medium animal1
# 5 elephant large animal1
# Second example table, same columns as the first one.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn the string columns into factors.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
# type size tableName
# 1 elephant medium animal2
# 2 dog large animal2
# 3 dog large animal2
# 4 elephant small animal2
# 5 elephant large animal2
# Stack both tables; the automatic row names 1..10 act as the row numbers
# referred to further down.
rbindAnimal <- rbind(animal1, animal2)
# type size tableName
# 1 cat small animal1
# 2 dog large animal1
# 3 dog small animal1
# 4 bird medium animal1
# 5 elephant large animal1
# 6 elephant medium animal2
# 7 dog large animal2
# 8 dog large animal2
# 9 elephant small animal2
# 10 elephant large animal2
# Animal types that occur in both source tables
intersectType <- intersect(
  rbindAnimal %>%
    filter(tableName == "animal1") %>%
    select(type),
  rbindAnimal %>%
    filter(tableName == "animal2") %>%
    select(type)
)
# type
# 1 elephant
# 2 dog
# Keep only the rows whose type is shared. Base `[` indexing is used so the
# original row names are retained; `%in%` never yields NA, so the logical
# mask can be used directly.
rbindAnimal <- rbindAnimal[rbindAnimal$type %in% intersectType$type, ]
# type size tableName
# 2 dog large animal1
# 3 dog small animal1
# 5 elephant large animal1
# 6 elephant medium animal2
# 7 dog large animal2
# 8 dog large animal2
# 9 elephant small animal2
# 10 elephant large animal2
# Needs to return row numbers! Here: 2,5,7,8, and 10
# type size tableName
# 2 dog large animal1
# 5 elephant large animal1
# 7 dog large animal2
# 8 dog large animal2
# 10 elephant large animal2
答案 0 :(得分:1)
"需要返回行号!"
使用 data.table 中的 `.I`(它存储行号)非常简单:
library(data.table)
# Convert to a data.table by reference so the `.I` row-index symbol can be used.
setDT(rbindAnimal)
# For every (type, size) group whose rows come from more than one table,
# collect the row numbers (.I); groups for which the `if` yields NULL are
# dropped from the result.
w <- rbindAnimal[, if (uniqueN(tableName) > 1L) .I, by = .(type, size)]$V1
# [1] 2 7 8 5 10
# Exclude the matched rows by row number. The complement is built with
# setdiff() rather than `-w`: a negative index made from an empty `w` is an
# empty index and would select no rows, whereas an empty `w` must keep them all.
rbindAnimal[setdiff(seq_len(nrow(rbindAnimal)), w)]
# type size tableName
# 1: cat small animal1
# 2: dog small animal1
# 3: bird medium animal1
# 4: elephant medium animal2
# 5: elephant small animal2
这里没有使用反连接(anti-join,如 OP 的回答那样),而是直接按行号排除行。
工作原理
`uniqueN` 计算唯一值的数量。OP 的条件(意译)是:
"类型与大小的组合同时出现在两个表中。"
这对应于在按 `by=.(type, size)` 分组之后,对每个分组判断 `uniqueN(tableName) > 1L`。
`if (cond) x` 在条件成立时返回 `x`,否则返回 `NULL`,从而丢弃该分组。
dplyr variant
它也适用于dplyr(虽然我不知道如何获取行号):
# Keep the (type, size) groups that occur in only one table, then drop the
# grouping so that later verbs do not silently operate per group.
rbindAnimal %>%
  group_by(type, size) %>%
  filter(n_distinct(tableName) == 1L) %>%
  ungroup()
# type size tableName
# <chr> <chr> <chr>
# 1 cat small animal1
# 2 dog small animal1
# 3 bird medium animal1
# 4 elephant medium animal2
# 5 elephant small animal2
答案 1 :(得分:0)
解决方案:使用 merge / semi_join / anti_join(感谢 @Imo 提示使用 merge!)
library(dplyr)
# First example table: one animal per row, tagged with the table it came from.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn the string columns into factors.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
# type size tableName
# 1 cat small animal1
# 2 dog large animal1
# 3 dog small animal1
# 4 bird medium animal1
# 5 elephant large animal1
# Second example table, same columns as the first one.
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned, which would silently turn the string columns into factors.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
# type size tableName
# 1 elephant medium animal2
# 2 dog large animal2
# 3 dog large animal2
# 4 elephant small animal2
# 5 elephant large animal2
# Stack both tables so that every row carries its source in `tableName`.
rbindAnimal <- rbind(animal1, animal2)
# Full outer merge on (type, size): a pair that exists in only one table gets
# NA in the other table's tableName column.
mergedAnimals <- merge(animal1, animal2, by = c("type", "size"), all = TRUE)
# Rows without NA are the (type, size) pairs present in both tables; keep one
# row per pair. These pairs are used directly as the join key table, so the
# stacked data does not have to be merged back in first.
sharedTypeSize <- mergedAnimals[complete.cases(mergedAnimals), ] %>%
  select(type, size) %>%
  unique()
# Rows of the stacked table whose (type, size) pair is shared by both tables.
# The join keys are given explicitly instead of relying on the natural join.
semi_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
# type size tableName
# 1 dog large animal1
# 2 dog large animal2
# 3 dog large animal2
# 4 elephant large animal1
# 5 elephant large animal2
# Rows of the stacked table whose (type, size) pair occurs in one table only.
anti_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
# type size tableName
# 1 cat small animal1
# 2 dog small animal1
# 3 bird medium animal1
# 4 elephant medium animal2
# 5 elephant small animal2