假设我有一张记录用户观看电影的数据表,例如:
library(data.table)
# Sample input: one row per (User, Movie) viewing, parsed from inline text.
DT <- fread("
User, Movie
Alice , Fight Club
Alice, The Godfather
Bob, Titanic
Charlotte, The Godfather")
我想为每一对电影计算:同时看过这两部电影的人数,以及至少看过其中一部的人数,即:
Movie1 Movie2 WatchedOne WatchedBoth
Fight Club The Godfather 2 1
The Godfather Titanic 3 0
Fight Club Titanic 2 0
我的数据有数百万行,因此需要一个非常快的 data.table 方案 :-)
感谢您的帮助!
答案 0 :(得分:2)
另一种方式:
# Collapse to one row per movie holding the distinct users who watched it.
# unique() guards against repeated (User, Movie) rows, which would otherwise
# inflate the lengths() used for WatchedOne below.
DT <- DT[, .(Users = list(unique(User))), keyby = "Movie"]

# Every unordered pair of distinct movies, one pair per row.
Y <- data.table(t(combn(DT$Movie, 2)))
setnames(Y, c("Movie1", "Movie2"))

# Update joins: attach each movie's user list to both sides of the pair.
Y[DT, on = .(Movie1 == Movie), Movie1.Users := Users]
Y[DT, on = .(Movie2 == Movie), Movie2.Users := Users]

# Alternative that needs a second set operation per pair:
# Y[, WatchedOne := lengths(Map(union, Movie1.Users, Movie2.Users))]
Y[, WatchedBoth := lengths(Map(intersect, Movie1.Users, Movie2.Users))]
# Inclusion-exclusion: |A or B| = |A| + |B| - |A and B|.
Y[, WatchedOne := lengths(Movie1.Users) + lengths(Movie2.Users) - WatchedBoth]

# Show the result without the two helper list columns (columns 3 and 4).
Y[, -(3:4)]
# Movie1 Movie2 WatchedBoth WatchedOne
# 1: Fight Club The Godfather 1 2
# 2: Fight Club Titanic 0 2
# 3: The Godfather Titanic 0 3
答案 1 :(得分:1)
下面的代码可以实现你的目标:
library(data.table)
# Sample input: one row per (User, Movie) viewing.
mydt <- data.table(
  User = c("Alice", "Alice", "Bob", "Charlotte"),
  Movie = c("Fight Club", "The Godfather", "Titanic", "The Godfather")
)

# One row per unordered pair of distinct movies.
mydt2 <- as.data.table(t(combn(unique(mydt$Movie), 2)))
# setnames() renames by reference; `names<-` would copy the data.table.
setnames(mydt2, c("Movie1", "Movie2"))

# For each pair: one row per user who watched at least one of the two movies,
# with N = how many of the two (distinct) movies that user watched.
temp <- lapply(seq_len(nrow(mydt2)), function(k) {
  pair <- c(mydt2$Movie1[k], mydt2$Movie2[k])
  mydt[Movie %in% pair, .(N = uniqueN(Movie)), by = User]
})

# vapply() yields plain integer columns; lapply() would create list columns.
mydt2[, WatchedOne := vapply(temp, nrow, integer(1))]
mydt2[, WatchedBoth := vapply(temp, function(x) sum(x$N == 2L), integer(1))]
# Movie1 Movie2 WatchedOne WatchedBoth
# 1: Fight Club The Godfather 2 1
# 2: Fight Club Titanic 2 0
# 3: The Godfather Titanic 3 0
答案 2 :(得分:0)
@sirallen @simone 谢谢你们的回答,两种方式我都试过了。不过,我找到了最快的方法:
# One row per unordered pair of movies.
# NOTE(review): `movie` is not defined in the visible code; presumably it is the
# vector of distinct movie identifiers (e.g. unique(DT$movie)) -- confirm.
DT_comb <- as.data.table(t(combn(movie, 2)))
# setnames() renames by reference instead of copying via `colnames<-`.
setnames(DT_comb, c("movie1", "movie2"))
# Number of users found under both movies (size of the intersection).
#
# `DT` is not a parameter: it is looked up from the enclosing environment and
# is filtered on its `movie` column, reading its `user_ID` column.
#
# movie_i, movie_j: the two movie values to compare.
# Returns the length of the intersection of the two user_ID vectors.
function_1 <- function(movie_i, movie_j) {
  viewers_i <- DT[movie == movie_i, user_ID]
  viewers_j <- DT[movie == movie_j, user_ID]
  length(intersect(viewers_i, viewers_j))
}
# Number of users found under at least one of the two movies (size of the
# union).
#
# `DT` is not a parameter: it is looked up from the enclosing environment and
# is filtered on its `movie` column, reading its `user_ID` column.
#
# movie_i, movie_j: the two movie values to compare.
# Returns the length of the union of the two user_ID vectors.
function_2 <- function(movie_i, movie_j) {
  viewers_i <- DT[movie == movie_i, user_ID]
  viewers_j <- DT[movie == movie_j, user_ID]
  length(union(viewers_i, viewers_j))
}
# Evaluate both counts for every movie pair on a local cluster.
library(parallel)

# Keep at least one worker: detectCores() - 1 is 0 on a single-core host and
# NA when the core count cannot be determined.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)

tryCatch(
  {
    # Workers need the data, both helpers, and data.table's `[` method.
    clusterExport(cl = cl, varlist = c("DT", "function_1", "function_2"))
    clusterCall(cl, function() library(data.table))

    # function_1 counts the intersection, i.e. users who watched both movies.
    # clusterMap() returns a list; unlist() turns it into a plain vector column.
    DT_comb$Watched_Both <- unlist(
      clusterMap(cl, function_1, DT_comb$movie1, DT_comb$movie2),
      use.names = FALSE
    )

    # function_2 counts the union, i.e. users who watched at least one movie.
    DT_comb$Watched_One <- unlist(
      clusterMap(cl, function_2, DT_comb$movie1, DT_comb$movie2),
      use.names = FALSE
    )
  },
  # Always release the worker processes, even if a call above fails.
  finally = stopCluster(cl)
)
如果也做并行化,你们的方案也许会比我的更快? :-)