Question

我在R中有两个数据表，它们具有相同的列（数字，名称和顺序）和一个ID，如下所示：

library(data.table)
dt1 <- data.table(ids = c(1, 2, 5), col1 = c("A", "B", "F"), col2 = c("B", "F", "G"))
dt2 <- data.table(ids = c(2, 1, 6, 5), col1 = c("B", "A", "K", "L"), col2 = c("F", "G", "M", "G"))

> dt1
   ids col1 col2
1:   1    A    B
2:   2    B    F
3:   5    F    G

> dt2
   ids col1 col2
1:   2    B    F
2:   1    A    G
3:   6    K    M
4:   5    L    G

我想为每一列知道多少个（公用）id具有相同的值。例如，对于col1，我们有：对于ID1，两个值都是A，对于ID2，两个值都是B，对于ID5，两个值是不同的，因此此列的最终结果是2。我有以下解决方案：

joint_dt <- merge(dt1, dt2, by = "ids", suffixes = c("", "_old"))
comp_res <- mapply(function(x, y) sum(x == y), joint_dt[, 2:ncol(dt1)], joint_dt[, (ncol(dt1) + 1):ncol(joint_dt)])  

> comp_res
col1 col2 
   2    2

这是做我想要的事情的最好方法，还是我缺少为此指定的某些软件包或功能？

Answer 1

另一种方法是使用内部联接来获得结果：

sapply(c(col1="col1",col2="col2"), function(x) dt1[dt2, on=c("ids", x), nomatch=0L, .N])

输出：

col1 col2 
   2    2

如果有人有兴趣计时代码，这里是示例数据（此处没有tidyverse计时）

library(data.table)
set.seed(0L)
nr <- 1e6L
nc <- 2L
nids <- nr/100
dt1 <- as.data.table(matrix(sample(nids, nr*nc, replace=TRUE), ncol=nc))[, ids := 1:nr]
setnames(dt1, names(dt1), gsub("^V", "col", names(dt1)))
dt2 <- as.data.table(matrix(sample(nids, nr*nc, replace=TRUE), ncol=nc))[, ids := 1:nr]
setnames(dt2, names(dt2), gsub("^V", "col", names(dt2)))

data.table解决方案的一些时间安排：

计时代码：

library(microbenchmark)
microbenchmark(
    mtd0={
            cols <- structure(paste0("col", seq_len(nc)), names=paste0("col", seq_len(nc)))
            sapply(cols, function(x) dt1[dt2, on=c("ids", x), nomatch=0L, .N])
        },
    mtd1=melt(dt1, id.vars = "ids")[ melt(dt2, id.vars = "ids"), ids2 := i.ids, on = .(variable, value)][
        !is.na(ids2), .N, by = variable],
    times=3L)

时间：

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval cld
 mtd0  179.4386  186.3906  195.6833  193.3425  203.8057  214.2689     3  a 
 mtd1 8306.7968 8373.2351 8467.4561 8439.6734 8547.7858 8655.8982     3   b

Answer 2

一种在熔融数据表上使用联接的方法

melt(dt1, id.vars = "ids")[ melt(dt2, id.vars = "ids"), ids2 := i.ids, on = .(variable, value)][!is.na(ids2), .N, by = variable][]

   variable N
1:     col1 2
2:     col2 2

Answer 3

另一种tidyverse方法：

library(tidyverse)
library(data.table)

dt1 <- data.table(ids = c(1, 2, 5), col1 = c("A", "B", "F"), col2 = c("B", "F", "G"))
dt2 <- data.table(ids = c(2, 1, 6, 5), col1 = c("B", "A", "K", "L"), col2 = c("F", "G", "M", "G"))

dt1 %>% gather(col,value1,-ids) %>%                                   # reshape dt1
  inner_join(dt2 %>% gather(col,value2,-ids), by=c("ids","col")) %>%  # reshape dt2 and join
  group_by(col) %>%                                                   # for each col value
  summarise(res = sum(value1 == value2))                              # count matches

# # A tibble: 2 x 2
#    col    res
#   <chr> <int>
# 1 col1      2
# 2 col2      2

Answer 4

一种tidyverse可能是：

dt2 %>%
 inner_join(dt1, by = c("ids" = "ids")) %>%
 gather(var, val, -ids) %>%
 separate(var, c("var", "temp")) %>%
 count(ids, var, val) %>%
 group_by(var) %>%
 summarise(n = length(n[n > 1])) %>%
 ungroup()

  var       n
  <chr> <int>
1 col1      2
2 col2      2

Answer 5

我认为map的{{1}}与purrr的过滤联接semi_join返回两个df中都存在的行的完美结合。

dplyr

结果

library(purrr)
library(dplyr)

map_dfc(c("col1", "col2"),
        ~dt1 %>% 
          semi_join(dt2 %>% select("ids", .x)) %>% 
          summarise(!!.x := n()))

比较数据表R中所有列的有效方法

5 个答案: