R - 计算data.frame中成对比较组中的相似值

时间:2017-12-22 23:05:13

标签: r dataframe intersection

我有篮球运动员的观察数据。每个ID代表一个玩家。

df <- data.frame(id = c("A", "B", "c"),
                  V1 = c(1, 3, 2),
                  V2 = c(1, 2, 2),
                  V3 = c(3, 1, NA))
df
  id V1 V2 V3
1  A  1  1  3
2  B  3  2  1
3  c  2  2 NA

我想成对地比较所有玩家并计算他们变量之间的相似数量。

如果在不同的列中找到值,则无关紧要。请注意,某些玩家在某些字段中有NA

期望的结果应该是这样的:

desired <- data.frame(id_x = c("A", "A", "B"),
                      id_y = c("B", "C", "C"),
                      similar = c(2, 0, 1))
desired
  id_x id_y similar
1    A    B       2
2    A    C       0
3    B    C       1

真实数据由数万名玩家组成,因此表现也很重要。

非常感谢任何指针。

4 个答案:

答案 0 :(得分:1)

将提供两种不同的方法:

 A=lapply(apply(df[-1],1,list),unlist)
 combn(A,2,function(x)sum(unique(na.omit(x[[1]]))%in%unique(na.omit(x[[2]]))))
  [1] 2 0 1

 B=apply(df[-1],1,function(x)apply(df[-1],1,function(y)sum(unique(na.omit(x))%in%unique(na.omit(y)))))
 B[lower.tri(B)]
 [1] 2 0 1

答案 1 :(得分:1)

我们可以列出每对行的列表,并使用它们来查找它们的相交。看下面:

toCheck <- combn(rownames(df), 2, simplify = FALSE)
names(toCheck) <-
  sapply(toCheck, paste, collapse = "&") 

sapply(toCheck, function(x){
  length(base::intersect(as.list(df[x[1],-1]), as.list(df[x[2],-1])))
})

# 1&2 1&3 2&3 
#   2   0   1

在更大的数据集上进行测试:

set.seed(45)
df2 <- data.frame(ID = sample(10e2),
                  V1 = sample(1:15, 10e2, replace = TRUE),
                  V2 = sample(1:16, 10e2, replace = TRUE),
                  V3 = sample(1:17, 10e2, replace = TRUE)) 

M_M_approach <- function(mdf) {
  Check <- combn(rownames(mdf), 2, simplify = FALSE)
  names(Check) <-  sapply(Check, paste, collapse = "&")

  sapply(Check, function(x){
    length(base::intersect(as.list(mdf[x[1],-1]), as.list(mdf[x[2],-1]))) })
}

M_M_approach(df2)
# 1&2 1&3 2&3 
#   1   1   2 

microbenchmark::microbenchmark(M_M_approach = M_M_approach(df2), times = 5)
# Unit: milliseconds
#          expr      min       lq     mean   median       uq      max neval
#  M_M_approach 225.6985 228.1924 248.5623 250.4814 255.1007 283.3385     5

答案 2 :(得分:0)

我对Onyambu提供的重要答案进行了基准测试。

制作更大的测试样本:

df2 <- data.frame(ID = sample(10e2),
                 V1 = sample(1:15, 10e2, replace = TRUE),
                 V2 = sample(2:16, 10e2, replace = TRUE),
                 V3 = sample(3:17, 10e2, replace = TRUE))

运行基准:

library(microbenchmark)
bench <- microbenchmark(

# option A
A=lapply(apply(df2[-1],1,list),unlist),
A1=combn(A,2,function(x)sum(unique(x[[1]])%in%unique(x[[2]]))),

# option B
B=apply(my.df2[-1],1,function(x)apply(df2[-1],1,function(y)sum(unique(x)%in%uni
  que(y)))),
B2= B[lower.tri(B)],

# repeat 5 times
times=5)

产地:

bench

Unit: milliseconds
 expr      min          lq        mean      median          uq         max neval cld
A     10.44847    10.83849    11.79438    11.33756    11.34568    15.00171     5 a  
A1 25420.53573 25735.88333 26721.22973 25802.89428 26658.98114 29987.85417     5  b 
B  52173.85540 52519.34839 53327.35931 52661.64372 54508.70321 54773.24582     5   c
B2    33.43663    34.16278    34.91674    35.19001    35.81182    35.98246     5 a  

原始数据较大。

是否有更高性能的选项?

答案 3 :(得分:0)

也许您也可以使用proxy来解决此问题:

connectedComponents(InputArray image, OutputArray labels, int connectivity=8, int ltype=CV_32S)

在我的机器上,它的确似乎更快一些:

library(proxy)

df <- data.frame(id = c("A", "B", "c"),
                 V1 = c(1, 3, 2),
                 V2 = c(1, 2, 2),
                 V3 = c(3, 1, NA))

myfun <- function(x, y) {
  sum(unique(setdiff(x, NA)) %in% y)
}

pr_DB$set_entry(FUN=myfun, names="myfun", distance=FALSE, loop=TRUE)

similar <- proxy::simil(df[, -1L], method="myfun")

res <- combn(df$id, 2L)
res <- data.frame(id_x=res[2L,], id_y=res[1L,])
res$similar <- as.integer(similar)

print(res)
  id_x id_y similar
1    B    A       2
2    c    A       0
3    c    B       1