在R中比较两个数据表所有列的高效方法

时间:2019-06-19 07:48:18

标签: r dplyr data.table

我在R中有两个数据表,它们具有相同的列(数量、名称和顺序均相同)以及一个ID列,如下所示:

library(data.table)
# Two example tables with the same column layout; `ids` is the key used to
# match rows between them (dt2 has an extra id, 6, that dt1 lacks).
dt1 <- data.table(
  ids  = c(1, 2, 5),
  col1 = c("A", "B", "F"),
  col2 = c("B", "F", "G")
)
dt2 <- data.table(
  ids  = c(2, 1, 6, 5),
  col1 = c("B", "A", "K", "L"),
  col2 = c("F", "G", "M", "G")
)

> dt1
   ids col1 col2
1:   1    A    B
2:   2    B    F
3:   5    F    G

> dt2
   ids col1 col2
1:   2    B    F
2:   1    A    G
3:   6    K    M
4:   5    L    G

我想知道,对于每一列,有多少个(两表共有的)id具有相同的值。例如,对于col1:ID 1在两个表中的值都是A,ID 2的值都是B,而ID 5的值不同,因此该列的最终结果是2。我目前的解决方案如下:

# Inner-join on the shared key so only ids present in both tables remain;
# dt1's columns keep their names and dt2's clashing columns get "_old".
joint_dt <- merge(dt1, dt2, by = "ids", suffixes = c("", "_old"))

# Pair the columns by name rather than by position, so the result does not
# depend on "ids" being the first column (and `2:ncol(dt1)` cannot degrade
# to `2:1` when there are no value columns).
value_cols <- setdiff(names(dt1), "ids")

# For each value column, count the joined rows where both tables agree.
# vapply() guarantees one integer per column and keeps the column names.
comp_res <- vapply(
  value_cols,
  function(col) sum(joint_dt[[col]] == joint_dt[[paste0(col, "_old")]]),
  integer(1L)
)

> comp_res
col1 col2 
   2    2 

这是实现我需求的最佳方法吗?还是说有我不知道的、专门用于此目的的软件包或函数?

5 个答案:

答案 0 :(得分:3)

另一种方法是使用内部联接来获得结果:

# For each compared column, inner-join dt1 with dt2 on (ids, that column) and
# return the number of joined rows (.N); nomatch = 0L drops unmatched rows.
# vapply() pins the result to exactly one integer per column, whereas the
# return type of sapply() depends on its input; the names of the input
# vector label the result.
vapply(
  c(col1 = "col1", col2 = "col2"),
  function(x) dt1[dt2, on = c("ids", x), nomatch = 0L, .N],
  integer(1L)
)

输出:

col1 col2 
   2    2 

如果有人有兴趣对代码计时,这里是示例数据(此处未包含tidyverse方案的计时):

library(data.table)
# Reproducible benchmark data: two tables of `nr` rows, each with `nc` value
# columns sampled with replacement from 1..nids, plus a sequential `ids` key.
set.seed(0L)
nr <- 1e6L
nc <- 2L
nids <- nr / 100

# dt1 is generated before dt2 so the random stream is consumed in a fixed order.
dt1 <- as.data.table(matrix(sample(nids, nr * nc, replace = TRUE), ncol = nc))
dt1[, ids := seq_len(nr)]
# as.data.table() names matrix columns V1, V2, ...; rename them to col1, col2, ...
setnames(dt1, names(dt1), sub("^V", "col", names(dt1)))

dt2 <- as.data.table(matrix(sample(nids, nr * nc, replace = TRUE), ncol = nc))
dt2[, ids := seq_len(nr)]
setnames(dt2, names(dt2), sub("^V", "col", names(dt2)))

data.table解决方案的一些计时结果:

计时代码:

library(microbenchmark)
# Time two data.table approaches against dt1/dt2, running each expression 3 times.
microbenchmark(
    # mtd0: one inner join per column on (ids, <column>); .N is the number of joined rows.
    mtd0={
            # Named character vector col1..col<nc>, so the sapply result is labelled by column.
            cols <- structure(paste0("col", seq_len(nc)), names=paste0("col", seq_len(nc)))
            sapply(cols, function(x) dt1[dt2, on=c("ids", x), nomatch=0L, .N])
        },
    # mtd1: melt both tables to long form, update-join to fill ids2, then count the
    # rows with a non-NA ids2 per variable.
    # NOTE(review): the join key here is (variable, value) and does not include ids,
    # so unlike mtd0 a match is not restricted to the same id -- confirm the two
    # methods are meant to be comparable.
    mtd1=melt(dt1, id.vars = "ids")[ melt(dt2, id.vars = "ids"), ids2 := i.ids, on = .(variable, value)][
        !is.na(ids2), .N, by = variable],
    times=3L)

时间:

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval cld
 mtd0  179.4386  186.3906  195.6833  193.3425  203.8057  214.2689     3  a 
 mtd1 8306.7968 8373.2351 8467.4561 8439.6734 8547.7858 8655.8982     3   b

答案 1 :(得分:1)

一种在melt(宽转长)后的数据表上使用联接的方法:

# Reshape both tables to long form (ids, variable, value), flag every dt1 row
# for which dt2 has a row with the same id, column and value, then count the
# flagged rows per column. A column with no matches produces no output row.
# `ids` must be part of the join key: joining on (variable, value) alone would
# also count a value that occurs in dt2 under a different id.
melt(dt1, id.vars = "ids")[melt(dt2, id.vars = "ids"), ids2 := i.ids, on = .(ids, variable, value)][!is.na(ids2), .N, by = variable][]

   variable N
1:     col1 2
2:     col2 2

答案 2 :(得分:1)

另一种tidyverse方法:

library(tidyverse)
library(data.table)

# Example tables (same data as in the question).
dt1 <- data.table(
  ids  = c(1, 2, 5),
  col1 = c("A", "B", "F"),
  col2 = c("B", "F", "G")
)
dt2 <- data.table(
  ids  = c(2, 1, 6, 5),
  col1 = c("B", "A", "K", "L"),
  col2 = c("F", "G", "M", "G")
)

# Stack each table into (ids, col, value) rows, pair the rows that share an id
# and a column name, then count per column how many pairs hold equal values.
inner_join(
  gather(dt1, col, value1, -ids),
  gather(dt2, col, value2, -ids),
  by = c("ids", "col")
) %>%
  group_by(col) %>%
  summarise(res = sum(value1 == value2))

# # A tibble: 2 x 2
#    col    res
#   <chr> <int>
# 1 col1      2
# 2 col2      2

答案 3 :(得分:0)

一种可能的tidyverse方法是:

# Join the two tables on ids (clashing columns become <name>.x / <name>.y),
# stack all value columns, and split the stacked name into the base column
# name (`var`) and the table suffix (`temp`). A (ids, var, val) combination
# that occurs more than once means both tables hold the same value for that
# id and column, so counting those combinations per column gives the answer.
dt2 %>%
  inner_join(dt1, by = "ids") %>%
  gather(var, val, -ids) %>%
  separate(var, into = c("var", "temp")) %>%
  count(ids, var, val) %>%
  group_by(var) %>%
  summarise(n = sum(n > 1)) %>%
  ungroup()

  var       n
  <chr> <int>
1 col1      2
2 col2      2

答案 4 :(得分:0)

我认为purrr的map与dplyr的过滤联接semi_join(返回两个df中都存在的行)是完美的结合。

结果

library(purrr)
library(dplyr)

# For each compared column, keep the dt1 rows whose (ids, column) pair also
# occurs in dt2 and count them; map_dfc() binds the one-row results
# column-wise into a single data frame with one count per column.
map_dfc(
  c("col1", "col2"),
  ~ dt1 %>%
    # An explicit `by` limits the match to ids plus the current column, so no
    # select() on dt2 is needed and no "Joining, by = ..." message is emitted.
    semi_join(dt2, by = c("ids", .x)) %>%
    # `!!.x :=` names the count column after the column being compared.
    summarise(!!.x := n())
)