Question

我有两个表，并希望得到输出t1，其中添加了一列，给出了t2中存在（id，category）的行数。以下是小数据集的示例：

# Lookup table: the (id, category) pairs whose occurrences should be counted.
id <- c(12, 12, 13, 14, 15)
category <- c(101, 104, 102, 101, 102)
t1 <- cbind(id = id, category = category)

# Reference table: the rows that are searched for those pairs.
id <- c(12, 12, 13, 13, 12, 15)
category <- c(101, 102, 101, 104, 101, 102)
t2 <- cbind(id = id, category = category)

> t1
     id category
[1,] 12      101
[2,] 12      104
[3,] 13      102
[4,] 14      101
[5,] 15      102
> t2
     id category
[1,] 12      101
[2,] 12      102
[3,] 13      101
[4,] 13      104
[5,] 12      101
[6,] 15      102

我希望在t2 中获取t1（id，类别）更新的t1：

> t1
      id  category  count_id_cat_in_t2
[1,]  12       101  2        # because (12,101) appears 2 times in t2
[2,]  12       104  0        # because (12,104) appears 0 times in t2
[3,]  13       102  0        # etc
[4,]  14       101  0
[5,]  15       102  1

由于我使用大型数据集，我需要一个解决方案，如果可能的话，可以在5分钟内完成大数据集：

t1有300千行
t2有1500万行

我在MySQL基于this answer工作，但无法使用sqldf将其移植到R中。我在Windows上工作，无法获得使用sqldf的MySQL命令所需的库。

Answer 1

来自beginnR的dplyr解决方案有效，但整个数据集耗时太长（我在50分钟后停止了它）。

我找到了一个使用数据表二进制搜索的快速解决方案：

# Sample data: t1 holds the (id, category) pairs to look up, t2 the rows to count.
id <- c(12, 12, 13, 14, 15)
category <- c(101, 104, 102, 101, 102)
t1 <- data.frame(id, category)

id <- c(12, 12, 13, 13, 12, 15)
category <- c(101, 102, 101, 104, 101, 102)
t2 <- data.frame(id, category)

library(data.table)
t2 <- data.table(t2)
setkey(t2, id, category) # key t2 on (id, category) so lookups use binary search

n <- nrow(t1)
counts <- integer(n) # preallocated; one count per row of t1

# seq_len() makes the loop a no-op for an empty t1 (1:0 would iterate over 1, 0).
for (i in seq_len(n)) {
  # Copy the pair into scalars whose names differ from t2's column names.
  id_ <- t1$id[i]
  category_ <- t1$category[i]

  # Keyed lookup of this pair; nomatch = 0 drops unmatched pairs, so the
  # number of returned rows is the number of occurrences in t2 (0 if none).
  counts[i] <- nrow(t2[J(id_, category_), nomatch = 0])
}

# Column name matches the one requested in the question.
t1$count_id_cat_in_t2 <- counts

对于大数据集，这在5/10分钟内有效（t1：150万行，t2：1500万行）。

使用data.table的另一种方式：

library(data.table)
setDT(t1)
setkey(setDT(t2), id, category)

# Join t1 against the keyed t2 and count the matching t2 rows for each row of
# t1. by = .EACHI is needed on data.table >= 1.9.4: without it, .N is evaluated
# once over the whole join result instead of once per row of t1. Pairs that do
# not occur in t2 get N = 0.
ans <- t2[t1, .N, by = .EACHI]

# Alternative: count every (id, category) pair of t2 once, then look the counts
# up from t1. Pairs missing from t2 come back as NA and are set to 0. (Counting
# the rows of t2[t1] directly would report 1 for such pairs, because the join
# keeps one row per unmatched t1 row.)
t2_counts <- t2[, .N, by = c("id", "category")]
ans <- t2_counts[t1, on = c("id", "category")]
ans[is.na(N), N := 0L]

Answer 2

1）这是一个sqldf解决方案：

# Two statements are passed to sqldf() in one call:
#   1. create an index on df2(id, category);
#   2. left-join df1 to the per-(id, category) row counts of df2, using
#      coalesce() to turn the missing count of unmatched pairs into 0.
# NOTE(review): df1/df2 presumably stand for t1/t2 from the question, and
# library(sqldf) is assumed to be attached already — confirm.
sqldf(c("create index i on df2(id, category)", "select A.*, coalesce(count, 0) count
       from df1 A 
       left natural join 
          (select *, count(*) count from main.df2 group by id, category)"))

，并提供：

  id category count
1 12      101     2
2 12      104     0
3 13      102     0
4 14      101     0
5 15      102     1

如果速度不是问题，则省略索引，在这种情况下，用main.df2替换df2。

2） RMySQL包在Windows下运行。我在Windows下使用RMySQL和sqldf运行以下命令。有关将sqldf与MySQL一起使用的更多信息，请参阅?sqldf。

# Attach sqldf together with RMySQL.
# NOTE(review): per the surrounding text this makes sqldf run against MySQL;
# see ?sqldf to confirm the backend selection.
library(sqldf)
library(RMySQL)
# Same count-and-join written as an explicit LEFT JOIN: the subquery B counts
# the df2 rows per (id, category) as kount, and coalesce() maps pairs of df1
# without a match to 0. The alias is kount because the final column is count.
sqldf(c("create index i on df2(id, category)", 
      "select A.*, coalesce(kount, 0) count
       from df1 A 
       left join 
          (select *, count(*) kount from df2 group by id, category) B
       on A.id = B.id and A.category = B.category"))

或使用您已经使用的SQL语句。这给出了：

id category count
1 12      101     2
2 12      104     0
3 13      102     0
4 14      101     0
5 15      102     1

Answer 3

使用dplyr，您可以执行以下操作：

id <- c(12, 12, 13, 14, 15)
category <- c(101, 104, 102, 101, 102)
t1 <- data.frame(id, category)     # data.frame format

id <- c(12, 12, 13, 13, 12, 15)
category <- c(101, 102, 101, 104, 101, 102)
t2 <- data.frame(id, category)     # data.frame format

library(dplyr)

# Count every (id, category) pair of t2 once, instead of rescanning all of t2
# for each group of t1.
t2_counts <- count(t2, id, category, name = "count_id_cat_in_t2")

# The left join keeps every row of t1 in its original order; pairs that never
# occur in t2 come back as NA and are replaced by 0. The `%.%` operator used
# by early dplyr versions no longer exists, hence `%>%`.
t1 <- t1 %>%
  left_join(t2_counts, by = c("id", "category")) %>%
  mutate(count_id_cat_in_t2 = coalesce(count_id_cat_in_t2, 0L))

请注意，t1和t2创建为data.frame s

Answer 4

这是一个普通的解决方案（我不确定它是否快速）

library(plyr)

# Lookup pairs.
id <- c(12, 12, 13, 14, 15)
category <- c(101, 104, 102, 101, 102)
t1 <- data.frame(id = id, category = category)

# Rows to be counted.
id <- c(12, 12, 13, 13, 12, 15)
category <- c(101, 102, 101, 104, 101, 102)
t2 <- data.frame(id = id, category = category)

# Number of t2 rows per (id, category) pair, stored in a column named "nrow".
pair_counts <- ddply(t2, .(id, category), c("nrow"))

# Attach the counts to t1; pairs without a match in t2 get NA, replaced by 0.
t3 <- join(t1, pair_counts, by = c("id", "category"))
t3$nrow <- replace(t3$nrow, is.na(t3$nrow), 0)

Answer 5

我们可以使用我的软件包safejoin中的eat，在t2中添加一个常量列，并在将其加入t1时对其进行汇总。

与其他答案一样使用数据帧：

# Installation and loading of safejoin (left commented out here):
# # devtools::install_github("moodymudskipper/safejoin")
# library(safejoin)

# mutate(t2, n=1) adds a constant column n = 1 to t2; eat() then brings n into
# t1 and aggregates it with sum (.agg = sum), which yields the number of
# matching t2 rows. Rows of t1 without a match get NA, replaced by 0 below.
# NOTE(review): mutate() and %>% are not attached by this snippet; presumably
# dplyr is loaded — confirm.
eat(t1, mutate(t2, n=1), n, .agg = sum) %>%
  mutate(n = replace(n, is.na(n), 0))
# Printed result:
#   id category n
# 1 12      101 2
# 2 12      104 0
# 3 13      102 0
# 4 14      101 0
# 5 15      102 1

R - 更新t1，其中t2行的计数，其中两列与t1相同

5 个答案: