我有一些相当简单的R代码,需要10min-20min的时间执行,我认为这是不必要的耗时。数据由大约30列和500.000行的数据帧组成。循环的目的是查看应将某个值放入哪种bin。
我试图通过在循环之前添加整列,在阅读有关该主题的其他线程后在循环外进行一些计算来改善代码,但是这些方法都没有显着改善代码。
col_days <- Sys.Date() - as.Date(df$col)
i=1
while (i < length(df$col)){
if (Sys.Date() - as.Date(df$col[i]) <366){
df$col_bin[i] <- "Less than 1 year"
i=i+1
}
else if (between(Sys.Date() - as.Date(df$col[i]), 366, 1095)){
df$col_bin[i] <- "1 year to 3 years"
i=i+1
}
else if (between(Sys.Date() - as.Date(df$col[i]), 1096, 1825)){
df$col_bin[i] <- "3 years to 5 years"
i=i+1
}
else if (between(Sys.Date() - as.Date(df$col[i]), 1826, 3650)){
df$col_bin[i] <- "5 years to 10 years"
i=i+1
}
else{
df$col_bin[i] <- "More than 10 years"
i=i+1
}
}
因此,使用此版本的代码,大约需要15分钟来计算所有行。我相信有几种方法可以改善这一点。有建议吗?
答案 0 :(得分:5)
这是使用dplyr::case_when()
(比base::cut()
更容易处理)的解决方案:
library(dplyr)
df %>%
mutate(
col_bin = case_when(
days < 366 ~ "Less than 1 year",
days < 1095 ~ "1 year to 3 years",
days < 1825 ~ "3 years to 5 years",
days < 3650 ~ "5 years to 10 years",
TRUE ~ "More than 10 years"
)
)
col days col_bin
1 2012-02-27 2538 days 5 years to 10 years
2 2014-11-27 1534 days 3 years to 5 years
3 2013-04-06 2134 days 5 years to 10 years
4 2009-08-15 3464 days 5 years to 10 years
5 2017-12-09 426 days 1 year to 3 years
6 2016-01-08 1127 days 3 years to 5 years
7 2015-05-08 1372 days 3 years to 5 years
8 2015-05-20 1360 days 3 years to 5 years
9 2010-09-08 3075 days 5 years to 10 years
10 2013-03-26 2145 days 5 years to 10 years
11 2010-03-15 3252 days 5 years to 10 years
12 2011-05-08 2833 days 5 years to 10 years
13 2017-07-21 567 days 1 year to 3 years
示例数据:
set.seed(10)
df <- data.frame(
col = Sys.Date() - sample(1:5000, size = 13)
)
df[["days"]] <- Sys.Date() - as.Date(df[["col"]])
答案 1 :(得分:2)
这里是使用dplyr
或data.table
以及case_when
和cut
的四个解决方案的比较。
感谢snoram提供示例数据以及dplyr
和case_when
部分。
在此测试中,dplyr
和data.table
的性能大致相同,但是cut
比case_when
快。与原始解决方案相比,所有解决方案的速度都应该更快,可能对于数据集大小的数据集来说绝对足够快。
require(data.table)
require(dplyr)
require(microbenchmark)
require(ggplot2)
set.seed(10)
df <- data.frame(
col = Sys.Date() - sample(1:5000, size = 13)
)
df[["days"]] <- Sys.Date() - as.Date(df[["col"]])
benchmark <- microbenchmark(
data.table={
dt <- data.table(df)
dt[, col_bin := cut(
as.numeric(days, units="days"),
breaks=c(-Inf, 366, 1095, 1825, 3650, Inf),
labels=c(
"Less than 1 year",
"1 year to 3 years",
"3 years to 5 years",
"5 years to 10 years",
"More than 10 years"
))]
},
dplyr={
res <- df %>%
mutate(
col_bin = case_when(
days < 366 ~ "Less than 1 year",
days < 1095 ~ "1 year to 3 years",
days < 1825 ~ "3 years to 5 years",
days < 3650 ~ "5 years to 10 years",
TRUE ~ "More than 10 years"
)
)
},
`data.table & case_when`={
dt <- data.table(df)
dt[, col_bin := case_when(
days < 366 ~ "Less than 1 year",
days < 1095 ~ "1 year to 3 years",
days < 1825 ~ "3 years to 5 years",
days < 3650 ~ "5 years to 10 years",
TRUE ~ "More than 10 years"
)]
},
`dplyr & cut`={
res <- df %>%
mutate(
col_bin = cut(
as.numeric(days, units="days"),
breaks=c(-Inf, 366, 1095, 1825, 3650, Inf),
labels=c(
"Less than 1 year",
"1 year to 3 years",
"3 years to 5 years",
"5 years to 10 years",
"More than 10 years"
))
)
}
)
autoplot(benchmark)