Question

我有一家医院的数据，变量很多，每行的日期和日期也如此，这告诉我们每行何时“有效”。每行最多可以有效一年。

test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
                  Payment=c(110,110,115,240,255), 
                  from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
                  to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
                  )

对于从一年到另一年的行，我想将这些行拆分，以使我最终得到两行而不是原始行，并且还要操纵from_date和to_date，最终得到像这样的新数据集：

  test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
                              Payment=c(110,110,110,110,115,115,240,255,255), 
                              from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
                              to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
    )

尝试：

library(lubridate) #for function "year" below
test_desired=test
row=c()
tmp=c()
for(i in 1:nrow(test_desired)){
  if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
    test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
    row = test_desired[i,]
    row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
    row$to_date = test$to_date[i]
    tmp=rbind(tmp,row)

  } else next
}
test_desired=rbind(test_desired,tmp)
library(dplyr)
test_desired=arrange(test_desired,ID,from_date)

例如，使用dplyr是否有更优雅的方法？

Answer 1

这是基于整洁的解决方案。它与Lennyy的相似，但条件检查较少，并且添加时间没有问题（它们可能会出现小标题，但显示为00:00:00）。我添加了ungroup()，因为听起来您在某个地方有一个分组变量（在Lennyy解决方案下的注释）。如果不这样做，可以将其删除：

library(dplyr)
library(lubridate)
library(purrr)

test %>% 
    ungroup() %>% # This isn't necessary if there are no groupings.
    split(rownames(test)) %>% 
    map_dfr(function(df){
        if (year(df$from_date) == year(df$to_date)) return(df)
        bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
                  mutate(df, from_date = floor_date(to_date, "y"))
                  )
    }
    )

#### OUTPUT ####

  ID Disease Pass Payment  from_date    to_date
1 10       P   US     110 2008-01-09 2008-12-31
2 10       P   US     110 2009-01-01 2009-01-08
3 10       P   US     110 2009-01-09 2009-12-31
4 10       P   US     110 2010-01-01 2010-01-08
5 10       P   US     115 2010-01-09 2010-12-31
6 10       P   US     115 2011-01-01 2011-01-08
7 12       D   EN     240 2008-01-01 2008-12-31
8 12       P   EN     255 2013-12-31 2013-12-31
9 12       P   EN     255 2014-01-01 2014-12-30

说明：数据帧分为行列表。然后，我使用map_dfr在其中from_date和to_date包含不同年份的每个数据帧上运行该函数。 map_dfr还将结果数据帧绑定在一起。在匿名函数中，我按年份对to_date进行下限设置，然后将其回滚到第一个月新to_date的上个月的最后一天，或者将其保留为在第二行中添加新的from_date。

Answer 2

使用 from_date 和 to_date ，我们可以使用seq.Date创建日期序列，然后按年份拆分该序列，最后选择每年的最小值和最大值。然后使用apply，separate_rows和separate获得最终结果。

cr_date <- function(d1, d2){
    #browser()
    sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day') 
    lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
                        function(x) paste0(min(x), '|', max(x)))
    result <- paste0(lst_dates, collapse = ';')
    return(result)
  }

#Test
#cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))

library(tidyr)
separate_rows(test, flag, sep=';') %>% 
  separate(flag, into = c('from_date_new','to_date_new'), '\\|') %>% 
  mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))


    ID Disease Pass Payment  from_date    to_date from_date_new to_date_new
  1 10       P   US     110 2008-01-09 2009-01-08    2008-01-09  2008-12-31
  2 10       P   US     110 2008-01-09 2009-01-08    2009-01-01  2009-01-08
  3 10       P   US     110 2009-01-09 2010-01-08    2009-01-09  2009-12-31
  4 10       P   US     110 2009-01-09 2010-01-08    2010-01-01  2010-01-08
  5 10       P   US     115 2010-01-09 2011-01-08    2010-01-09  2010-12-31
  6 10       P   US     115 2010-01-09 2011-01-08    2011-01-01  2011-01-08
  7 12       D   EN     240 2008-01-01 2008-12-31    2008-01-01  2008-12-31
  8 12       P   EN     255 2013-12-31 2014-12-30    2013-12-31  2013-12-31
  9 12       P   EN     255 2013-12-31 2014-12-30    2014-01-01  2014-12-30

Answer 3

这仅使用基数R。

首先请注意，仅使用没有时间的日期，因此我们应该使用Date类，而不是POSIXct。除非您非常小心，否则后者可能会不必要地引入时区错误，因此在最后的注释中显示了所使用的输入，我们假设我们从包含test2类数据的Date开始。注意中的代码还显示了如何将其转换为Date类的POSIXct类。

鉴于test2，我们添加了from_year，to_year和eoy（到年底的日期）列，得到test3。然后，我们对行进行迭代，如果年份相同，则返回该行，如果不相同，则返回拆分的行。这给出了一列和两行数据帧的列表，我们一起rbind。

test3 <- transform(test2, 
  from_year = format(from_date, "%Y"),
  to_year = format(to_date, "%Y"),
  eoy = as.Date(sub("-.*", "-12-31", from_date)))

nr <- nrow(test2)
do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
  if (from_year == to_year) test2[i, ]
  else data.frame(ID, Disease, Pass, Payment, 
      from_date = c(from_date, eoy+1),
      to_date = c(eoy, to_date)))
))

注意

假定输入为可复制形式。如上所述，它使用Date类。

test2 <- transform(test, 
  from_date = as.Date(from_date),
  to_date = as.Date(to_date))

Answer 4

我正在使用data.table，它也提供了year功能并使用as.POSIXct忽略可能的慢日期转换逻辑。

我还假设to_date和from_date可能相差仅一年（不超过一年！）。

library(data.table)  # also provides a "year" function

setDT(test)

# Create additional rows for the new year
additional_rows <- test[year(from_date) < year(to_date), ]
additional_rows[, from_date := as.POSIXct(paste0(year(to_date),"-01-01"))]

# Shorten the "from_date" of the affected original rows
test[year(from_date) < year(to_date), to_date := as.POSIXct(paste0(year(from_date),"-12-31"))]

# Create a combined data table as result
result <- rbind(test, additional_rows)
setkey(result, ID, Payment, from_date)  # just to sort the data like the "test_desired" sort order

结果

> result
   ID Disease Pass Payment  from_date    to_date
1: 10       P   US     110 2008-01-09 2008-12-31
2: 10       P   US     110 2009-01-01 2009-01-08
3: 10       P   US     110 2009-01-09 2009-12-31
4: 10       P   US     110 2010-01-01 2010-01-08
5: 10       P   US     115 2010-01-09 2010-12-31
6: 10       P   US     115 2011-01-01 2011-01-08
7: 12       D   EN     240 2008-01-01 2008-12-31
8: 12       P   EN     255 2013-12-31 2013-12-31
9: 12       P   EN     255 2014-01-01 2014-12-30

Answer 5

您也可以使用dplyr和lubridate尝试以下类似方法。它的工作方式如下：1.使用rbind复制数据帧。 2.首先在ID上排列，然后在from_date上排列，然后按照test中给定的行顺序排列第三。 3.在偶数行中，将from_date更改为新年的第一天。 4.在奇数行中，将to_date更改为上一年的最后一天。 5.最后，排除from_date和to_date之间的差异仅1秒的行。

test %>% 
  rbind(test) %>% 
  arrange(ID, from_date) %>% 
  mutate(from_date = if_else(row_number() %% 2 == 0, ceiling_date(from_date, "year") + 1, from_date),
         to_date = if_else(row_number() %% 2 == 1, floor_date(to_date, "year") - 1, to_date)) %>% 
  filter(from_date - to_date != 1)

  ID Disease Pass Payment           from_date             to_date
1 10       P   US     110 2008-01-09 00:00:00 2008-12-31 23:59:59
2 10       P   US     110 2009-01-01 00:00:01 2009-01-08 00:00:00
3 10       P   US     110 2009-01-09 00:00:00 2009-12-31 23:59:59
4 10       P   US     110 2010-01-01 00:00:01 2010-01-08 00:00:00
5 10       P   US     115 2010-01-09 00:00:00 2010-12-31 23:59:59
6 10       P   US     115 2011-01-01 00:00:01 2011-01-08 00:00:00
7 12       D   EN     240 2008-01-01 00:00:01 2008-12-31 00:00:00
8 12       P   EN     255 2013-12-31 00:00:00 2013-12-31 23:59:59
9 12       P   EN     255 2014-01-01 00:00:01 2014-12-30 00:00:00

唯一的缺点可能是增加了时间，但是您当然可以删除这些时间。而且如果某个时期可能会持续到第三年，则可以使用相同的逻辑，但是第二个rbind和row_number() %% 3 == 0

按新年拆分日期行

5 个答案:

注意