我想获得Y列中值的所有可能组合,因此我想出了一个嵌套函数,该函数虽然工作良好,但运行时间却很长。
这是一个模拟数据集:
x <- 10000000
set.seed(1)
data <- data.frame(
X = sample(seq(from = 20, to = 50, by = 5), size = x, replace = TRUE),
Y = sample(100:50000,size = x, replace = TRUE),
Year = sample(1990:2018,size = x, replace = TRUE)
)
> head(data)
X Y Year
1 25 26587 2015
2 30 34275 2018
3 40 19226 2015
4 50 47754 2002
5 25 6006 1995
6 50 2051 1992
嵌套功能和所需的输出
data <- data %>%
select(X, Y, Year) %>%
group_by(X,Year) %>%
expand(nesting(Y,Year), Y) %>%
filter(Y<=Y1,Y!=Y1) %>%
select(X,Y,Y1,Year) %>%
arrange(X)
> head(data)
# A tibble: 6 x 4
# Groups: X, Year [2]
X Y Y1 Year
<dbl> <int> <int> <int>
1 20 4933 9210 1990
2 20 4933 42170 1990
3 20 9210 42170 1990
4 20 3983 10981 1991
5 20 3983 29820 1991
6 20 3983 33915 1991
实际数据集的Y和X列均为类字符,包括字母和数字。不幸的是,我无法将它们包括在模拟数据集中,并且我注意到在处理嵌套函数时,数字比字符快得多。
您对如何加快功能或编写data.table
版本的功能有任何建议吗?
答案 0 :(得分:2)
如果您有足够的RAM,可以尝试以下方法:
library(data.table)
ans <- setDT(data)[,
.SD[.SD, on=.(Y<Y), .(Y=x.Y, Y1=i.Y), nomatch=0L, allow.cartesian=TRUE],
by=.(X, Year)]
setcolorder(ans, c("X", "Y", "Y1", "Year"))
ans
与Rui的方法进行时间比较:
library(data.table)
x <- 1e5
set.seed(1)
data <- data.frame(
X = sample(seq(from = 20, to = 50, by = 5), size = x, replace = TRUE),
Y = sample(100:50000,size = x, replace = TRUE),
Year = sample(1990:2018,size = x, replace = TRUE)
)
DF <- data
mtd1 <- function() {
ans <- setDT(data)[, .SD[.SD, on=.(Y<Y), .(Y=x.Y, Y1=i.Y), nomatch=0L, allow.cartesian=TRUE], by=.(X, Year)]
setcolorder(ans, c("X", "Y", "Y1", "Year"))
ans
}
bench::mark(mtd1(), funRui(DF), check=FALSE)
时间:
# A tibble: 2 x 14
expression min mean median max `itr/sec` mem_alloc n_gc n_itr total_time result memory time gc
<chr> <bch:tm> <bch:tm> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <bch:tm> <list> <list> <list> <list>
1 mtd1() 2.17s 2.17s 2.17s 2.17s 0.460 2.3GB 4 1 2.17s <data.table [24,626,111 x ~ <Rprofmem [7,258 x ~ <bch:t~ <tibble [1 x ~
2 funRui(DF) 7.48s 7.48s 7.48s 7.48s 0.134 4.09GB 5 1 7.48s <data.frame [24,626,111 x ~ <Rprofmem [14,618 x~ <bch:t~ <tibble [1 x ~
答案 1 :(得分:1)
像用户@alistaire在comment中说,
如果您要进行一千万行的组合,即使有重复, 它会变得巨大。第一个问题应该是考虑 对于更大的问题是否有更好的方法。
无论如何,以下内容确实将时间减少了7或8倍。
library(tidyverse)
library(microbenchmark)
funOP <- function(DF){
DF %>%
select(X, Y, Year) %>%
group_by(X,Year) %>%
expand(nesting(Y,Year), Y) %>%
filter(Y<=Y1,Y!=Y1) %>%
select(X,Y,Y1,Year) %>%
arrange(X)
}
funRui <- function(DF){
sp <- split(DF, list(DF[["X"]], DF[["Year"]]))
sp <- sp[sapply(sp, nrow) > 0]
res <- lapply(sp, function(df){
if(nrow(df) > 1){
expgrid <- expand.grid(df$Y, df$Y)
expgrid <- expgrid[expgrid[[1]] < expgrid[[2]], ]
if(nrow(expgrid) > 0){
expgrid$X <- df$X[1]
expgrid$Year <- df$Year[1]
expgrid[c(3, 1, 2, 4)]
} else NULL
} else NULL
})
res <- dplyr::bind_rows(res)
res <- res[order(res[[1]]), ]
row.names(res) <- NULL
names(res)[2:3] <- c("Y", "Y1")
res
}
op <- funOP(data)
rui <- funRui(data)
all.equal(op, rui)
#[1] TRUE
microbenchmark(
OP = funOP(data),
Rui = funRui(data),
times = 10
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# OP 987.4617 997.4650 1020.4778 1012.6133 1021.9069 1109.9730 10
# Rui 120.8338 123.9419 137.7035 125.6596 129.0781 245.7496 10
数据创建代码。
我重复数据创建代码以具有较小尺寸的数据框。
x <- 1e3
set.seed(1)
data <- data.frame(
X = sample(seq(from = 20, to = 50, by = 5), size = x, replace = TRUE),
Y = sample(100:50000,size = x, replace = TRUE),
Year = sample(1990:2018,size = x, replace = TRUE)
)