比嵌套更快

时间:2019-07-04 19:34:26

标签: r dplyr data.table

我想获得Y列中值的所有可能组合,因此我想出了一个嵌套函数,该函数虽然工作良好,但运行时间却很长。

这是一个模拟数据集:

x <- 10000000
set.seed(1)
data <- data.frame(
  X = sample(seq(from = 20, to = 50, by = 5), size = x, replace = TRUE),
  Y = sample(100:50000,size = x, replace = TRUE),
  Year = sample(1990:2018,size = x, replace = TRUE)
)

> head(data)
   X     Y Year
1 25 26587 2015
2 30 34275 2018
3 40 19226 2015
4 50 47754 2002
5 25  6006 1995
6 50  2051 1992

嵌套功能和所需的输出

data <- data %>% 
  select(X, Y, Year) %>% 
  group_by(X,Year) %>% 
  expand(nesting(Y,Year), Y) %>% 
  filter(Y<=Y1,Y!=Y1) %>% 
  select(X,Y,Y1,Year) %>% 
  arrange(X)

> head(data)
# A tibble: 6 x 4
# Groups:   X, Year [2]
      X     Y    Y1  Year
  <dbl> <int> <int> <int>
1    20  4933  9210  1990
2    20  4933 42170  1990
3    20  9210 42170  1990
4    20  3983 10981  1991
5    20  3983 29820  1991
6    20  3983 33915  1991

实际数据集的Y和X列均为类字符,包括字母和数字。不幸的是,我无法将它们包括在模拟数据集中,并且我注意到在处理嵌套函数时,数字比字符快得多。

您对如何加快功能或编写data.table版本的功能有任何建议吗?

2 个答案:

答案 0 :(得分:2)

如果您有足够的RAM,可以尝试以下方法:

library(data.table)
ans <- setDT(data)[, 
    .SD[.SD, on=.(Y<Y), .(Y=x.Y, Y1=i.Y), nomatch=0L, allow.cartesian=TRUE], 
    by=.(X, Year)]
setcolorder(ans, c("X", "Y", "Y1", "Year"))
ans

与Rui的方法进行时间比较:

library(data.table)
x <- 1e5
set.seed(1)
data <- data.frame(
    X = sample(seq(from = 20, to = 50, by = 5), size = x, replace = TRUE),
    Y = sample(100:50000,size = x, replace = TRUE),
    Year = sample(1990:2018,size = x, replace = TRUE)
)
DF <- data

mtd1 <- function() {
    ans <- setDT(data)[, .SD[.SD, on=.(Y<Y), .(Y=x.Y, Y1=i.Y), nomatch=0L, allow.cartesian=TRUE], by=.(X, Year)]
    setcolorder(ans, c("X", "Y", "Y1", "Year"))
    ans
}

bench::mark(mtd1(), funRui(DF), check=FALSE)

时间:

# A tibble: 2 x 14
  expression      min     mean   median      max `itr/sec` mem_alloc  n_gc n_itr total_time result                      memory               time    gc            
  <chr>      <bch:tm> <bch:tm> <bch:tm> <bch:tm>     <dbl> <bch:byt> <dbl> <int>   <bch:tm> <list>                      <list>               <list>  <list>        
1 mtd1()        2.17s    2.17s    2.17s    2.17s     0.460     2.3GB     4     1      2.17s <data.table [24,626,111 x ~ <Rprofmem [7,258 x ~ <bch:t~ <tibble [1 x ~
2 funRui(DF)    7.48s    7.48s    7.48s    7.48s     0.134    4.09GB     5     1      7.48s <data.frame [24,626,111 x ~ <Rprofmem [14,618 x~ <bch:t~ <tibble [1 x ~

答案 1 :(得分:1)

像用户@alistaire在comment中说,

  

如果您要进行一千万行的组合,即使有重复,   它会变得巨大。第一个问题应该是考虑   对于更大的问题是否有更好的方法。

无论如何,以下内容确实将时间减少了7或8倍。

library(tidyverse)
library(microbenchmark)

funOP <- function(DF){
  DF %>% 
    select(X, Y, Year) %>% 
    group_by(X,Year) %>% 
    expand(nesting(Y,Year), Y) %>% 
    filter(Y<=Y1,Y!=Y1) %>% 
    select(X,Y,Y1,Year) %>% 
    arrange(X)
}

funRui <- function(DF){
  sp <- split(DF, list(DF[["X"]], DF[["Year"]]))
  sp <- sp[sapply(sp, nrow) > 0]
  res <- lapply(sp, function(df){
    if(nrow(df) > 1){
      expgrid <- expand.grid(df$Y, df$Y)
      expgrid <- expgrid[expgrid[[1]] < expgrid[[2]], ]
      if(nrow(expgrid) > 0){
        expgrid$X <- df$X[1]
        expgrid$Year <- df$Year[1]
        expgrid[c(3, 1, 2, 4)]
      } else NULL
    } else NULL
  })
  res <- dplyr::bind_rows(res)
  res <- res[order(res[[1]]), ]
  row.names(res) <- NULL
  names(res)[2:3] <- c("Y", "Y1")
  res
}


op <- funOP(data)
rui <- funRui(data)
all.equal(op, rui)
#[1] TRUE


microbenchmark(
  OP = funOP(data),
  Rui = funRui(data),
  times = 10
)
#Unit: milliseconds
# expr      min       lq      mean    median        uq       max neval
#   OP 987.4617 997.4650 1020.4778 1012.6133 1021.9069 1109.9730    10
#  Rui 120.8338 123.9419  137.7035  125.6596  129.0781  245.7496    10

数据创建代码。

我重复数据创建代码以具有较小尺寸的数据框。

x <- 1e3
set.seed(1)
data <- data.frame(
  X = sample(seq(from = 20, to = 50, by = 5), size = x, replace = TRUE),
  Y = sample(100:50000,size = x, replace = TRUE),
  Year = sample(1990:2018,size = x, replace = TRUE)
)