R中的复杂data.table操作

时间:2017-09-05 14:26:14

标签: r data.table

我们假设我有一个人看电影的数据表,比如

library(data.table)
# Sample viewing log: one row per (User, Movie) pair, parsed from inline CSV
# text. The column padding and the stray space after the first "Alice" are
# expected to be trimmed by fread's default whitespace stripping.
DT <- fread(input = "
User,        Movie
Alice ,      Fight Club
Alice,       The Godfather
Bob,         Titanic
Charlotte,   The Godfather")

我想为每对电影计算观看两者的人数和观看至少一部电影的人数,即

Movie1        Movie2           WatchedOne   WatchedBoth
Fight Club    The Godfather    2            1
The Godfather Titanic          3            0
Fight Club    Titanic          2            0

我有数百万行,我需要一个非常快速的data.table函数: - )

感谢您的帮助!

3 个答案:

答案 0 :(得分:2)

另一种方式:

# Collapse the viewing log to one row per movie, holding the set of users who
# watched it. unique() guards against repeated (User, Movie) rows: without it
# the lengths() terms below would count the same viewer more than once and
# inflate WatchedOne.
DT <- DT[, .(Users = list(unique(User))), keyby = "Movie"]

# Every unordered pair of movies (Movie is unique after the grouping above).
Y <- data.table(t(combn(DT$Movie, 2)))
setnames(Y, c("Movie1", "Movie2"))

# Attach each movie's viewer set to the pair table via update joins.
Y[DT, on = .(Movie1 == Movie), Movie1.Users := Users]
Y[DT, on = .(Movie2 == Movie), Movie2.Users := Users]

# Direct (slower) alternative for WatchedOne, kept for reference:
# Y[, WatchedOne := lengths(Map(union, Movie1.Users, Movie2.Users))]
Y[, WatchedBoth := lengths(Map(intersect, Movie1.Users, Movie2.Users))]
# Cheaper than materialising each union: |A or B| = |A| + |B| - |A and B|.
Y[, WatchedOne := lengths(Movie1.Users) + lengths(Movie2.Users) - WatchedBoth]

> Y[, -(3:4)]
#           Movie1        Movie2 WatchedBoth WatchedOne
# 1:    Fight Club The Godfather           1          2
# 2:    Fight Club       Titanic           0          2
# 3: The Godfather       Titanic           0          3

答案 1 :(得分:1)

这实现了你的目标

library(data.table)

# Sample viewing log: one row per (User, Movie) pair.
mydt <- data.table(
  User = c("Alice", "Alice", "Bob", "Charlotte"),
  Movie = c("Fight Club", "The Godfather", "Titanic", "The Godfather")
)

# One row per unordered pair of distinct movies.
mydt2 <- as.data.table(t(combn(unique(mydt$Movie), 2)))
setnames(mydt2, c("Movie1", "Movie2"))

# For every pair, tabulate per user how many DISTINCT movies of the pair they
# watched. uniqueN() (rather than .N) keeps the count correct even when the
# log contains the same (User, Movie) row more than once.
temp <- Map(
  function(first, second) {
    mydt[Movie %in% c(first, second), .(n_seen = uniqueN(Movie)), by = User]
  },
  mydt2$Movie1,
  mydt2$Movie2
)

# vapply() yields plain integer columns; lapply() would create list columns.
# WatchedOne: users who saw at least one movie of the pair (one row per user).
mydt2[, WatchedOne := vapply(temp, nrow, integer(1), USE.NAMES = FALSE)]
# WatchedBoth: users who saw both movies of the pair.
mydt2[, WatchedBoth := vapply(
  temp,
  function(per_user) per_user[, sum(n_seen == 2L)],
  integer(1),
  USE.NAMES = FALSE
)]

# Movie1        Movie2 WatchedOne WatchedBoth
# 1:    Fight Club The Godfather          2           1
# 2:    Fight Club       Titanic          2           0
# 3: The Godfather       Titanic          3           0

答案 2 :(得分:0)

@sirallen @simone 谢谢你的回答,我尝试了两种方式。 但是,我找到了最快的方法:

# Build every unordered pair of distinct movie titles, one pair per row.
# The titles are taken from DT's `movie` column (the column the lookup helpers
# below filter on); the original referenced a free variable `movie` that this
# snippet never defines. unique() is needed because DT presumably holds one
# row per viewing, so the raw column repeats titles and would otherwise
# produce duplicate and self pairs.
DT_comb <- as.data.table(t(combn(unique(DT$movie), 2)))

# Rename by reference; `colnames<-` may copy the whole table.
setnames(DT_comb, c("movie1", "movie2"))

# Number of users who watched BOTH movies.
#
# Looks up the `user_ID` values recorded for each title in the global
# data.table `DT` (filtering on its `movie` column) and returns the size of
# the intersection of the two user sets.
function_1 <- function(movie_i, movie_j) {
  viewers_i <- DT[movie == movie_i, user_ID]
  viewers_j <- DT[movie == movie_j, user_ID]
  length(intersect(viewers_i, viewers_j))
}

# Number of users who watched AT LEAST ONE of the two movies.
#
# Looks up the `user_ID` values recorded for each title in the global
# data.table `DT` (filtering on its `movie` column) and returns the size of
# the union of the two user sets.
function_2 <- function(movie_i, movie_j) {
  viewers_i <- DT[movie == movie_i, user_ID]
  viewers_j <- DT[movie == movie_j, user_ID]
  length(union(viewers_i, viewers_j))
}

# makeCluster(), clusterExport(), clusterMap() etc. live in the parallel
# package, which has to be attached before use.
library(parallel)

# Leave one core free, but never request fewer than one worker:
# detectCores() can return 1 (giving 0 workers) or NA.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)

tryCatch(
  {
    # Each worker needs the data, the two counting helpers and data.table.
    clusterExport(cl = cl, varlist = c("DT", "function_1", "function_2"))
    invisible(clusterEvalQ(cl, library(data.table)))

    # function_2 counts the union of the two viewer sets, i.e. users who
    # watched at least one movie of the pair. clusterMap() returns a list,
    # so it is flattened to an integer vector to get a regular column.
    DT_comb$Watched_One <- as.integer(unlist(
      clusterMap(cl, function_2, DT_comb$movie1, DT_comb$movie2),
      use.names = FALSE
    ))

    # function_1 counts the intersection, i.e. users who watched both.
    DT_comb$Watched_Both <- as.integer(unlist(
      clusterMap(cl, function_1, DT_comb$movie1, DT_comb$movie2),
      use.names = FALSE
    ))
  },
  # Always shut the workers down, even if a worker call fails.
  finally = stopCluster(cl)
)

在并行化时,你的解决方案可能比我的解决方案更快? : - )