我有一个带有日期的数据集,例如。
id <- 1:1000
admission_date <- sample(seq(as.Date('2016/01/01'), as.Date('2018/12/31'), by="day"), 1000)
discharge_date <- admission_date + days(100)
extract <- tibble(id, admission_date, discharge_date)
我需要将天数归因于相关季度。我有一些代码可以做到这一点;
min_date <- min(extract$admission_date)
max_date <- max(extract$discharge_date)
for (year in year(min_date):year(max_date)) {
for (quarter in 1:4) {
min_start_date <- yq(paste(year, quarter)) - days(1)
max_end_date <- yq(paste(year, quarter)) + months(3) - days(1)
extract <-
extract %>% mutate(
!!paste(year, quarter) := case_when(
# doa before start of period and dod after end of month (or missing dod) - end of month minus start of month
(
admission_date < min_start_date &
discharge_date > max_end_date
) ~ time_length(min_start_date %--% max_end_date, "days"),
# doa equal or greater to start of period (but within month) and dod after end of month (or missing dod) - end of month minus doa
(
admission_date >= min_start_date &
admission_date <= max_end_date &
discharge_date > max_end_date
) ~ time_length(admission_date %--% max_end_date, "days"),
# doa on or before start of period and dod on or before end of month (but within month) - dod minus start of month
(
admission_date <= min_start_date &
discharge_date <= max_end_date &
discharge_date > min_start_date
) ~ time_length(min_start_date %--% discharge_date, "days"),
# remainder - doa after start of period and dod on or before end of period - dod minus doa
(
admission_date > min_start_date &
discharge_date <= max_end_date
) ~ time_length(admission_date %--% discharge_date, "days"),
TRUE ~ 0
)
)
}
}
但是它很慢(我的实际数据有200万以上的行),我认为可以通过将其包装到函数中,然后使用purrr(或可能是furrr)来改善。
到目前为止,这就是我所想的,它似乎挂起了,所以我什至不知道问题出在哪里...
test <- function(data, year, quarter) {
min_start_date <- yq(paste(year, quarter)) - days(1)
max_end_date <- yq(paste(year, quarter)) + months(3) - days(1)
data <-
data %>% transmute(
!!paste(year, quarter) := case_when(
# doa before start of period and dod after end of month (or missing dod) - end of month minus start of month
(
admission_date < min_start_date &
discharge_date > max_end_date
) ~ time_length(min_start_date %--% max_end_date, "days"),
# doa equal or greater to start of period (but within month) and dod after end of month (or missing dod) - end of month minus doa
(
admission_date >= min_start_date &
admission_date <= max_end_date &
discharge_date > max_end_date
) ~ time_length(admission_date %--% max_end_date, "days"),
# doa on or before start of period and dod on or before end of month (but within month) - dod minus start of month
(
admission_date <= min_start_date &
discharge_date <= max_end_date &
discharge_date > min_start_date
) ~ time_length(min_start_date %--% discharge_date, "days"),
# remainder - doa after start of period and dod on or before end of period - dod minus doa
(
admission_date > min_start_date &
discharge_date <= max_end_date
) ~ time_length(admission_date %--% discharge_date, "days"),
TRUE ~ 0
)
)
return(data)
}
years = as.list(rep(year(min_date):year(max_date), 4))
quarters = as.list(rep(1:4, length(years) / 4))
library(purrr)
extract2 <- extract %>% pmap(years, quarters, test)
答案 0 :(得分:1)
首先创建自定义函数的参数输入列表
years = as.list(rep(year(min_date):year(max_date), 4))
quarters = as.list(rep(1:4, length(years) / 4))
param <- purrr::cross2(years, quarters)
然后创建一个自定义函数,将参数列表和数据作为输入
test <- function(param, data) {
year <- param[[1]]
quarter <- param[[2]]
min_start_date <- yq(paste(year, quarter)) - days(1)
max_end_date <- yq(paste(year, quarter)) + months(3) - days(1)
data <-
data %>% transmute(
!!paste(year, quarter) := case_when(
# doa before start of period and dod after end of month (or missing dod) - end of month minus start of month
(
admission_date < min_start_date &
discharge_date > max_end_date
) ~ time_length(min_start_date %--% max_end_date, "days"),
# doa equal or greater to start of period (but within month) and dod after end of month (or missing dod) - end of month minus doa
(
admission_date >= min_start_date &
admission_date <= max_end_date &
discharge_date > max_end_date
) ~ time_length(admission_date %--% max_end_date, "days"),
# doa on or before start of period and dod on or before end of month (but within month) - dod minus start of month
(
admission_date <= min_start_date &
discharge_date <= max_end_date &
discharge_date > min_start_date
) ~ time_length(min_start_date %--% discharge_date, "days"),
# remainder - doa after start of period and dod on or before end of period - dod minus doa
(
admission_date > min_start_date &
discharge_date <= max_end_date
) ~ time_length(admission_date %--% discharge_date, "days"),
TRUE ~ 0
)
)
data
}
然后使用purrr
或furrr
library(purrr)
extract2 <- purrr::map_dfc(param, test, extract)
library(furrr)
plan(multicore(workers = 8))
extract3 <- furrr::future_map_dfc(param, test, extract)