Question

我的问题扩展了这个问题：Generate rows between two dates into a data frame in R

我有一个关于医院患者入院、出院和住院时间 (Stay_in_days) 的数据集。它看起来像这样：

ID Admission           Discharge             Stay_in_days 
1    2020-08-20 15:25:03 2020-08-21 21:09:34 1.239
2    2020-10-04 21:53:43 2020-10-09 11:02:57 4.548
...

到目前为止，日期采用 POSIXct 格式。

我的目标是：

ID   Date                 Stay_in_days 
1    2020-08-20 15:25:03  0.357 
1    2020-08-21 21:09:49  1.239
2    2020-10-04 21:53:43  0.087
2    2020-10-05 00:00:00  1.087
2    2020-10-06 00:00:00  2.087
2    2020-10-07 00:00:00  3.087
2    2020-10-08 00:00:00  4.087
2    2020-10-09 11:02:57  4.548
...

到目前为止我做了什么：

M <- Map(seq, patients$Admission, patients$Discharge, by = "day")
patients2 <- data.frame(
  ID = rep.int(patients$ID, vapply(M, length, 1L)), 
  Date = do.call(c, M)
) 

patients <- patients %>%
mutate(
 Date2=as.Date(Date, format = "%Y-%m-%d"),
 Dat2=Date2+1,
 Diff=difftime(Date2, Date, units = "days")
)

但这给了我：

ID   Date                 Date2      Diff
1    2020-08-20 17:25:03  2020-08-21 0.375
1    2020-08-21 17:25:03  2020-08-22 0.357
2    2020-10-04 23:53:43  2020-10-05 0.087
2    2020-10-05 23:53:43  2020-10-06 0.087
2    2020-10-06 23:53:43  2020-10-07 0.087
2    2020-10-07 23:53:43  2020-10-08 0.087
2    2020-10-08 23:53:43  2020-10-09 0.087
...

奇怪的是，它在入学日期上增加了两个小时，但计算出正确的逗留时间。谁能解释一下？

这是一些数据：

structure(list(ID = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
13, 14, 15, 16, 17, 18, 19, 20), Admission = structure(c(1597937103.872, 
1598717768.704, 1599060521.984, 1599758087.168, 1599815496.704, 
1600702198.784, 1600719631.36, 1601065923.584, 1601119400.96, 
1601215476.736, 1601236710.4, 1601416934.4, 1601499640.832, 1601545647.104, 
1601587328, 1601644868.608, 1601741206.528, 1601848423.424, 1601901245.44, 
1601913828.352), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    Discharge = structure(c(1598044189.696, 1598897337.344, 1599144670.208, 
    1599845118.976, 1599842366.464, 1602733683.712, 1603372135.424, 
    1601125168.128, 1601314173.952, 1605193905.152, 1602190259.2, 
    1601560720.384, 1601737143.296, 1602705634.304, 1602410460.16, 
    1602698425.344, 1601770566.656, 1602241377.28, 1602780476.416, 
    1602612048.896), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    Stay_in_days = c(1.239, 2.078, 0.974, 1.007, 0.311, 23.513, 
    30.7, 0.686, 2.254, 46.047, 11.036, 1.664, 2.749, 13.426, 
    9.527, 12.194, 0.34, 4.548, 10.176, 8.081)), row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

预先感谢您的帮助！

Answer 1

您可以使用 tidyr 中的 pivot_longer 实现此目的。编辑：有评论：

df1 <- df %>% 
  select(ID = ID, date1 = Admission, date2 = Discharge, Stay_in_days) %>% # prepare for pivoting
  pivot_longer(
    cols = starts_with("date"),
    names_to = "Date1",
    values_to = "Date",
  ) %>% # pivot to longformat
  select(-Date1) %>% # remove temporary Date1
  relocate(Stay_in_days, .after = Date) %>% # change column order
  group_by(ID) %>%
  mutate(idgroup = rep(row_number(), each=1:2, length.out = n())) %>% # id for admission = 1 and for discharge id = 2
  mutate(Stay_in_days = replace(Stay_in_days, row_number() == 1, 0)) %>%  # set Admission to zero
  ungroup()

Answer 2

虽然有点粗糙，但它会起作用

library(tidyverse)
library(lubridate)

df %>% 
  pivot_longer(cols = -c(ID, Stay_in_days), names_to = "Event", values_to = "DATE") %>%
  group_by(ID) %>%
  mutate(dummy = case_when(Event == "Admission" ~ 0,
                           Event == "Discharge" ~ max(floor(Stay_in_days),1))) %>%
  complete(dummy = seq(min(dummy), max(dummy), 1)) %>%
  mutate(Event = ifelse(is.na(Event), "Dummy", Event),
         DATE = if_else(is.na(DATE), first(DATE)+dummy*24*60*60, DATE),
         Stay_in_days = case_when(Event == "Admission" ~ as.numeric(difftime(ceiling_date(DATE, "day"), DATE, units = "days")),
                                   Event == "Discharge" ~ Stay_in_days,
                                   TRUE ~ dummy + as.numeric(difftime(ceiling_date(first(DATE), "day"), first(DATE), units = "days")))) %>%
  select(ID, DATE, Stay_in_days)

# A tibble: 199 x 3
# Groups:   ID [20]
      ID DATE                Stay_in_days
   <dbl> <dttm>                     <dbl>
 1     1 2020-08-20 15:25:03        0.358
 2     1 2020-08-21 21:09:49        1.24 
 3     2 2020-08-29 16:16:08        0.322
 4     2 2020-08-30 16:16:08        1.32 
 5     2 2020-08-31 18:08:57        2.08 
 6     3 2020-09-02 15:28:41        0.355
 7     3 2020-09-03 14:51:10        0.974
 8     4 2020-09-10 17:14:47        0.281
 9     4 2020-09-11 17:25:18        1.01 
10     5 2020-09-11 09:11:36        0.617
# ... with 189 more rows

逻辑说明对于每个 ID 中的第一个日期，stay_in_days 给出了从入场日期-时间到接下来的 24 小时的持续时间。对于中间日期，它只是将 1 添加到以前的值。对于 discharge_date，它保留旋转前计算的停留值。希望这是你所追求的。

代码说明 旋转更长的时间后，我使用了一个虚拟列来创建中间日期时间对象。之后，我只需 mutate 列用于生成如上所述的输出。

R：根据开始和结束日期展开行并计算天之间的小时数

2 个答案: