我在R中有两个数据框,我需要逐行计算元素匹配,最后获得一个列,其中包含两个表的笛卡尔积的长度和两行的ID。此外,表格非常大,行数不同,但列数相同。
我有以下代码,但多次运行时速度很慢。
library(data.table)
table_1<-data.table(matrix(c(1:24),nrow = 4))
table_2<-data.table(matrix(c(11:34),nrow = 4))
names(table_1)<-c("s1", "s2","s3","s4","s5","s6")
names(table_2)<-c("a1","a2","a3","a4","a5","a6")
table_1$ID<-seq.int(nrow(table_1))
table_2$ID_ap<-seq.int(nrow(table_2))
setcolorder(table_1, c("ID", "s1", "s2","s3","s4","s5","s6"))
setcolorder(table_2, c("ID_ap","a1","a2","a3","a4","a5","a6"))
CJ.table<-function(X,Y) setkey(X[,c(k=1,.SD)],k)[Y[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]
join<-CJ.table(table_1,table_2)
R<-subset(join, select=c("ID_ap","ID"))
R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+
(join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+
(join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+
(join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+
(join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+
(join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)
给出了
R
ID_ap ID Ac
1: 1 1 0
2: 1 2 0
3: 1 3 4
4: 1 4 0
5: 2 1 0
6: 2 2 0
7: 2 3 0
8: 2 4 4
9: 3 1 3
10: 3 2 0
11: 3 3 0
12: 3 4 0
13: 4 1 0
14: 4 2 3
15: 4 3 0
16: 4 4 0
答案 0 :(得分:4)
将数据放入长格式,因为列顺序无关紧要:
setnames(table_2, "ID_ap", "ID")
tabs = rbind(
melt(table_1, id="ID")[, variable := NULL],
melt(table_2, id="ID")[, variable := NULL],
idcol = TRUE)
(1)对于每个值,确定相关对;和
(2)对,对,计数值:
tabs[,
if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
, by=value][,
.N
, by=.(ID1, ID2)]
ID1 ID2 N
1: 3 1 4
2: 4 2 4
3: 1 3 3
4: 2 4 3
我认为所有其他(ID1, ID2)
组合都是零,无需明确枚举。
如果每个表中的值不同,就像在OP的示例中那样,那么我们可以简化:
tabs[, if (.N==2L) .(ID1 = ID[1L], ID2 = ID[2L]), by=value][, .N, by=.(ID1, ID2)]
答案 1 :(得分:3)
假设两个表中行数和唯一值的乘积不大:
x1 = unlist(table_1, FALSE, FALSE)
x2 = unlist(table_2, FALSE, FALSE)
具有共同的唯一值:
lvs = union(x1, x2)
tabulate
每个表的每一行中每个唯一值的出现:
tab1 = matrix(tabulate(seq_len(nrow(table_1)) + (match(x1, lvs) - 1L) * nrow(table_1),
nrow(table_1) * length(lvs)),
nrow(table_1), length(lvs))
tab2 = matrix(tabulate(seq_len(nrow(table_2)) + (match(x2, lvs) - 1L) * nrow(table_2),
nrow(table_2) * length(lvs)),
nrow(table_2), length(lvs))
最后:
tcrossprod(tab1, tab2) #or 'tcrossprod(tab1 > 0L, tab2 > 0L)' to not count duplicate matches
# [,1] [,2] [,3] [,4]
#[1,] 0 0 3 0
#[2,] 0 0 0 3
#[3,] 4 0 0 0
#[4,] 0 4 0 0
#and to change format (among different ways):
ans = tcrossprod(tab1, tab2)
cbind(c(row(ans)), c(col(ans)), c(ans))
如果tab1
和tab2
非常大,可以将它们构建为稀疏矩阵,方法可以是:
library(Matrix)
stab1 = xtabs(rep_len(1L, length(x1)) ~
rep_len(seq_len(nrow(table_1)), length(x1))
+ factor(match(x1, lvs), lvs),
sparse = TRUE)
stab2 = xtabs(rep_len(1L, length(x2)) ~
rep_len(seq_len(nrow(table_2)), length(x2))
+ factor(match(x2, lvs), lvs),
sparse = TRUE)
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
# 1 2 3 4
#1 . . 3 .
#2 . . . 3
#3 4 . . .
#4 . 4 . .
修改强>
在每行中有(1)小的正整数值和(2)不同的值,使用match
/ unique
/ union
创建查找并且可以避免制表:
x1 = unlist(table_1, FALSE, FALSE)
x2 = unlist(table_2, FALSE, FALSE)
nlvs = max(max(x1), max(x2))
stab1 = sparseMatrix(i = rep_len(seq_len(nrow(table_1)), length(x1)),
j = x1,
x = 1L,
dims = c(nrow(table_1), nlvs))
stab2 = sparseMatrix(i = rep_len(seq_len(nrow(table_2)), length(x2)),
j = x2,
x = 1L,
dims = c(nrow(table_2), nlvs))
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
#
#[1,] . . 3 .
#[2,] . . . 3
#[3,] 4 . . .
#[4,] . 4 . .
summary(tcrossprod(stab1, stab2))
#4 x 4 sparse Matrix of class "dgCMatrix", with 4 entries
# i j x
#1 3 1 4
#2 4 2 4
#3 1 3 3
#4 2 4 3
答案 2 :(得分:2)
怎么样:
colSums(apply(join[, !c("ID", "ID_ap"), with = F], 1, duplicated))
#[1] 0 0 4 0 0 0 0 4 3 0 0 0 0 3 0 0
或者,从头开始:
setkey(table_1, ID)
setkey(table_2, ID_ap)
ids = CJ(ID1 = table_1$ID, ID2 = table_2$ID_ap)
ids[, sum(duplicated(c(table_1[.(ID1), !'ID', with = F],
table_2[.(ID2), !'ID_ap', with = F])))
, by = .(ID1, ID2)]
# ID1 ID2 V1
# 1: 1 1 0
# 2: 1 2 0
# 3: 1 3 3
# 4: 1 4 0
# 5: 2 1 0
# 6: 2 2 0
# 7: 2 3 0
# 8: 2 4 3
# 9: 3 1 4
#10: 3 2 0
#11: 3 3 0
#12: 3 4 0
#13: 4 1 0
#14: 4 2 4
#15: 4 3 0
#16: 4 4 0
答案 3 :(得分:2)
在帖子中没有明确说明性能要求。但是,我已经创建了一个更大版本的可重现示例(如下所示),问题中的代码已经非常快了。
这里是如何在基地R中做到这一点,为了更好的衡量标准:
t1 <- as.data.frame(table_1)
t2 <- as.data.frame(table_2)
system.time({
## compute all combinations of indices
indices <- merge(t1[1], t2[1])
## create a mega df including all rows, cbinded together
rows <- cbind(t1[indices[ ,"ID"], -1], t2[indices[ , "ID_ap"], -1])
t1_cols <- names(rows) %in% names(t1)
t2_cols <- names(rows) %in% names(t2)
## compute the counts; this step takes most of the time
## ~ 14 of the 18 second in this example
counts <- apply(rows, 1, function(r) sum(r[t1_cols] %in% r[t2_cols]))
})
out <- data.frame(indices, Ac=counts)
例如,对于来自下方(dim(out) == c(1e6, 3)
)的大型可重现问题,上述代码运行时间不到20秒。
user system elapsed
17.879 0.348 18.245
修改重复性较大的问题:
library(data.table)
NROW <- 1e4
NROW2 <- 1e2
table_1<-data.table(matrix(c(1:24),nrow = NROW, ncol=6))
table_2<-data.table(matrix(c(11:34),nrow = NROW2, ncol=6))
names(table_1)<-c("s1", "s2","s3","s4","s5","s6")
names(table_2)<-c("a1","a2","a3","a4","a5","a6")
table_1$ID<-seq.int(nrow(table_1))
table_2$ID_ap<-seq.int(nrow(table_2))
setcolorder(table_1, c("ID", "s1", "s2","s3","s4","s5","s6"))
setcolorder(table_2, c("ID_ap","a1","a2","a3","a4","a5","a6"))
OP的解决方案比
更快地 运行CJ.table<-function(X,Y) setkey(X[,c(k=1,.SD)],k)[Y[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]
join<-CJ.table(table_1,table_2)
R<-subset(join, select=c("ID_ap","ID"))
system.time({
R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+
(join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+
(join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+
(join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+
(join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+
(join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)
})
# user system elapsed
# 0.295 0.044 0.339
但弗兰克答案中的解决方案仍然更快
setnames(table_2, "ID_ap", "ID")
tabs = rbind(
melt(table_1, id="ID")[, variable := NULL],
melt(table_2, id="ID")[, variable := NULL],
idcol = TRUE)
system.time({out3 <- tabs[,
if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
, by=value][,
.N
, by=.(ID1, ID2)]})
# user system elapsed
# 0.109 0.013 0.122
答案 4 :(得分:1)
这是一种可能性:
> t1<-data.frame(matrix(c(1:24),nrow = 4))
> t2<-data.frame(matrix(c(11:34),nrow = 4))
> ret<-expand.grid(r1=1:nrow(t1),r2=1:nrow(t2))
> ret$matches<-apply(ret,1,function(a)sum(t1[a[1],] %in% t2[a[2],]))
> ret
r1 r2 matches
1 1 1 0
2 2 1 0
3 3 1 4
4 4 1 0
5 1 2 0
6 2 2 0
7 3 2 0
8 4 2 4
9 1 3 3
10 2 3 0
11 3 3 0
12 4 3 0
13 1 4 0
14 2 4 3
15 3 4 0
16 4 4 0