R中的条件交集

时间:2017-02-12 21:38:06

标签: r conditional dplyr intersection

希望筛选出同时出现在两个表中的动物(交集条件1),并且这些动物在两个表中具有相同的大小(交集条件2)。有没有高效的写法——例如使用dplyr?

library(dplyr)
# Example table 1: one row per animal with its size class. tableName tags
# every row with its source table so rows stay traceable once stacked.
# stringsAsFactors is spelled FALSE because `F` is an ordinary variable that
# can be reassigned.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
      #       type   size tableName
      # 1      cat  small   animal1
      # 2      dog  large   animal1
      # 3      dog  small   animal1
      # 4     bird medium   animal1
      # 5 elephant  large   animal1

# Example table 2: same columns, different animals and sizes.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
      #      type   size tableName
      # 1 elephant medium   animal2
      # 2      dog  large   animal2
      # 3      dog  large   animal2
      # 4 elephant  small   animal2
      # 5 elephant  large   animal2


# Stack both tables; rows 1-5 come from animal1 and rows 6-10 from animal2.
rbindAnimal <- rbind(animal1, animal2)
      #        type   size tableName
      # 1       cat  small   animal1
      # 2       dog  large   animal1
      # 3       dog  small   animal1
      # 4      bird medium   animal1
      # 5  elephant  large   animal1
      # 6  elephant medium   animal2
      # 7       dog  large   animal2
      # 8       dog  large   animal2
      # 9  elephant  small   animal2
      # 10 elephant  large   animal2

# Condition 1 only: the animal types that occur in both source tables.
# The size condition is not applied at this point.
intersectType <- intersect(
  rbindAnimal %>%
    filter(tableName == "animal1") %>%
    select(type),
  rbindAnimal %>%
    filter(tableName == "animal2") %>%
    select(type)
)
      #     type
      # 1 elephant
      # 2      dog

# Keep the rows whose type is shared. %in% never yields NA, so the logical
# mask can index the rows directly; the original row names are preserved.
rbindAnimal <- rbindAnimal[rbindAnimal$type %in% intersectType$type, ]

      #        type   size tableName
      # 2       dog  large   animal1
      # 3       dog  small   animal1
      # 5  elephant  large   animal1
      # 6  elephant medium   animal2
      # 7       dog  large   animal2
      # 8       dog  large   animal2
      # 9  elephant  small   animal2
      # 10 elephant  large   animal2

# Desired result (not produced by the code above): the rows, identified by
# their row numbers 2, 5, 7, 8 and 10, whose type AND size occur in both
# tables.
#        type   size tableName
# 2       dog  large   animal1
# 5  elephant  large   animal1
# 7       dog  large   animal2
# 8       dog  large   animal2
# 10 elephant  large   animal2

2 个答案:

答案 0 :(得分:1)

  

"需要返回行号!"

使用data.table中的.I非常简单,它存储行号:

library(data.table)
# Turn rbindAnimal into a data.table so the [i, j, by] syntax below applies.
# NOTE: the row numbers and output shown below correspond to the full 10-row
# rbind(animal1, animal2) (cat and bird are still present), not to a table
# that was already filtered by type.
setDT(rbindAnimal)

# For every (type, size) group, return the row numbers (.I) only when the
# group contains rows from more than one source table; for other groups the
# `if` yields NULL and the group is dropped. $V1 extracts the collected row
# numbers, which come out in group order rather than sorted.
w <- rbindAnimal[, if (uniqueN(tableName) > 1L) .I, by=.(type, size)]$V1
# [1]  2  7  8  5 10
# Negative indexing removes the shared rows, leaving the rows whose
# (type, size) pair appears in only one table.
rbindAnimal[-w]
#        type   size tableName
# 1:      cat  small   animal1
# 2:      dog  small   animal1
# 3:     bird medium   animal1
# 4: elephant medium   animal2
# 5: elephant  small   animal2

这里没有使用反连接(anti-join,如OP自己的回答),而是直接按行号排除行。

工作原理

  • uniqueN计算唯一值的数量。OP的条件是(意译):

    (类型,大小)的组合同时出现在两个表中。

    对应到代码,就是在按 by=.(type, size) 分组的行中要求

      uniqueN(tableName) > 1L

  • if (cond) x:如果条件成立,则返回x;否则返回NULL,该分组被丢弃。

dplyr variant

它也适用于dplyr(虽然我不知道如何获取行号):

# Keep the (type, size) groups whose rows all come from a single source
# table, i.e. the complement of the rows shared by both tables.
# ungroup() drops the grouping afterwards so later verbs on the result do not
# silently operate per group.
rbindAnimal %>%
  group_by(type, size) %>%
  filter(n_distinct(tableName) == 1L) %>%
  ungroup()
#       type   size tableName
#      <chr>  <chr>     <chr>
# 1      cat  small   animal1
# 2      dog  small   animal1
# 3     bird medium   animal1
# 4 elephant medium   animal2
# 5 elephant  small   animal2

答案 1 :(得分:0)

解决方案:使用merge / semi_join / anti_join(感谢@Imo提供的merge提示!)

library(dplyr)
# Example table 1: one row per animal with its size class. tableName tags
# every row with its source table so rows stay traceable once stacked.
# stringsAsFactors is spelled FALSE because `F` is an ordinary variable that
# can be reassigned.
animal1 <- data.frame(
  type = c("cat", "dog", "dog", "bird", "elephant"),
  size = c("small", "large", "small", "medium", "large"),
  tableName = rep("animal1", 5),
  stringsAsFactors = FALSE
)
      #       type   size tableName
      # 1      cat  small   animal1
      # 2      dog  large   animal1
      # 3      dog  small   animal1
      # 4     bird medium   animal1
      # 5 elephant  large   animal1

# Example table 2: same columns, different animals and sizes.
animal2 <- data.frame(
  type = c("elephant", "dog", "dog", "elephant", "elephant"),
  size = c("medium", "large", "large", "small", "large"),
  tableName = rep("animal2", 5),
  stringsAsFactors = FALSE
)
      #      type   size tableName
      # 1 elephant medium   animal2
      # 2      dog  large   animal2
      # 3      dog  large   animal2
      # 4 elephant  small   animal2
      # 5 elephant  large   animal2

# Stack both tables; tableName records which table each row came from.
rbindAnimal <- rbind(animal1, animal2)

# Full outer merge on (type, size). A pair present in only one table gets NA
# in the other table's tableName column, so complete.cases() keeps exactly
# the pairs that occur in both tables.
mergedAnimals <- merge(animal1, animal2, by = c("type", "size"), all = TRUE)
sharedTypeSize <- mergedAnimals[complete.cases(mergedAnimals), ] %>%
  select(type, size) %>%
  unique()
sharedTypeSize <- merge(rbindAnimal, sharedTypeSize, by = c("type", "size"))

# The join keys are spelled out so the result does not depend on which other
# columns the two tables happen to share.

# Rows whose (type, size) pair occurs in both tables.
semi_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))
      #        type  size tableName
      # 1      dog large   animal1
      # 2      dog large   animal2
      # 3      dog large   animal2
      # 4 elephant large   animal1
      # 5 elephant large   animal2

# Complement: rows whose (type, size) pair occurs in only one table.
anti_join(rbindAnimal, sharedTypeSize, by = c("type", "size"))

      #       type   size tableName
      # 1      cat  small   animal1
      # 2      dog  small   animal1
      # 3     bird medium   animal1
      # 4 elephant medium   animal2
      # 5 elephant  small   animal2