使用purrr和mutate将多个列添加到数据框

时间:2019-07-09 16:14:03

标签: r dplyr tidyverse purrr

我有一个带有日期的数据集,例如。

id <- 1:1000
admission_date <- sample(seq(as.Date('2016/01/01'), as.Date('2018/12/31'), by="day"), 1000)
discharge_date <- admission_date + days(100)

extract <- tibble(id, admission_date, discharge_date)

我需要将天数归因于相关季度。我有一些代码可以做到这一点;

min_date <- min(extract$admission_date)
max_date <- max(extract$discharge_date)


for (year in year(min_date):year(max_date)) {
  for (quarter in 1:4) {
    min_start_date <- yq(paste(year, quarter)) - days(1)
    max_end_date <- yq(paste(year, quarter)) + months(3) - days(1)

    extract <-
      extract %>% mutate(
        !!paste(year, quarter) := case_when(
          # doa before start of period and dod after end of month (or missing dod) - end of month minus start of month
          (
            admission_date < min_start_date &
              discharge_date > max_end_date
          ) ~ time_length(min_start_date %--% max_end_date, "days"),
          # doa equal or greater to start of period (but within month) and dod after end of month (or missing dod)  - end of month minus doa
          (
            admission_date >= min_start_date &
              admission_date <= max_end_date &
              discharge_date > max_end_date
          ) ~ time_length(admission_date %--% max_end_date, "days"),
          # doa on or before start of period and dod on or before end of month (but within month)  - dod minus start of month
          (
            admission_date <= min_start_date &
              discharge_date <= max_end_date &
              discharge_date > min_start_date
          ) ~ time_length(min_start_date %--% discharge_date, "days"),
          # remainder - doa after start of period and dod on or before end of period  - dod minus doa
          (
            admission_date > min_start_date &
              discharge_date <= max_end_date
          ) ~ time_length(admission_date %--% discharge_date, "days"),
          TRUE ~ 0
        )
      )
  }
}

但是它很慢(我的实际数据有200万以上的行),我认为可以通过将其包装到函数中,然后使用purrr(或可能是furrr)来改善。

到目前为止,这就是我所想的,它似乎挂起了,所以我什至不知道问题出在哪里...

test <- function(data, year, quarter) {
  min_start_date <- yq(paste(year, quarter)) - days(1)
  max_end_date <- yq(paste(year, quarter)) + months(3) - days(1)

  data <-
    data %>% transmute(
      !!paste(year, quarter) := case_when(
        # doa before start of period and dod after end of month (or missing dod) - end of month minus start of month
        (
          admission_date < min_start_date &
            discharge_date > max_end_date
        ) ~ time_length(min_start_date %--% max_end_date, "days"),
        # doa equal or greater to start of period (but within month) and dod after end of month (or missing dod)  - end of month minus doa
        (
          admission_date >= min_start_date &
            admission_date <= max_end_date &
            discharge_date > max_end_date
        ) ~ time_length(admission_date %--% max_end_date, "days"),
        # doa on or before start of period and dod on or before end of month (but within month)  - dod minus start of month
        (
          admission_date <= min_start_date &
            discharge_date <= max_end_date &
            discharge_date > min_start_date
        ) ~ time_length(min_start_date %--% discharge_date, "days"),
        # remainder - doa after start of period and dod on or before end of period  - dod minus doa
        (
          admission_date > min_start_date &
            discharge_date <= max_end_date
        ) ~ time_length(admission_date %--% discharge_date, "days"),
        TRUE ~ 0
      )
    )
  return(data)
}


years = as.list(rep(year(min_date):year(max_date), 4))
quarters = as.list(rep(1:4, length(years) / 4))

library(purrr)

extract2 <- extract %>% pmap(years, quarters, test)

1 个答案:

答案 0 :(得分:1)

首先创建自定义函数的参数输入列表

years = as.list(rep(year(min_date):year(max_date), 4))
quarters = as.list(rep(1:4, length(years) / 4))

param <- purrr::cross2(years, quarters)

然后创建一个自定义函数,将参数列表和数据作为输入

test <- function(param, data) {
  year    <- param[[1]]
  quarter <- param[[2]]

  min_start_date <- yq(paste(year, quarter)) - days(1)
  max_end_date <- yq(paste(year, quarter)) + months(3) - days(1)

  data <-
    data %>% transmute(
      !!paste(year, quarter) := case_when(
        # doa before start of period and dod after end of month (or missing dod) - end of month minus start of month
        (
          admission_date < min_start_date &
            discharge_date > max_end_date
        ) ~ time_length(min_start_date %--% max_end_date, "days"),
        # doa equal or greater to start of period (but within month) and dod after end of month (or missing dod)  -     end of month minus doa
        (
          admission_date >= min_start_date &
            admission_date <= max_end_date &
            discharge_date > max_end_date
        ) ~ time_length(admission_date %--% max_end_date, "days"),
        # doa on or before start of period and dod on or before end of month (but within month)  - dod minus start of     month
        (
          admission_date <= min_start_date &
            discharge_date <= max_end_date &
            discharge_date > min_start_date
        ) ~ time_length(min_start_date %--% discharge_date, "days"),
        # remainder - doa after start of period and dod on or before end of period  - dod minus doa
        (
          admission_date > min_start_date &
            discharge_date <= max_end_date
        ) ~ time_length(admission_date %--% discharge_date, "days"),
        TRUE ~ 0
      )
    )
  data
}

然后使用purrrfurrr

library(purrr)  

extract2 <- purrr::map_dfc(param, test, extract)


library(furrr)
plan(multicore(workers = 8))

extract3 <- furrr::future_map_dfc(param, test, extract)