我必须将开始日期和结束日期分成30天的间隔,并派生两个新变量“拆分开始日期”和“拆分结束日期”。
我尝试了很多东西,但没有用......
df_actual <- data.frame(
order_id = "a",
start_date="01/05/2017",
ends_with="06/07/2017"
)
df_expected <- data.frame(
order_id = rep("a", 3),
start_date = rep("01/05/2017", 3),
ends_with = rep("06/07/2017", 3),
split_start_date = c("01/05/2017", "31/05/2017", "30/06/2017"),
split_end_date = c("30/05/2017", "29/06/2017", "06/07/2017")
)
答案 0 :(得分:0)
我增加了您的示例以包含2个order_id,以查看如果ID超过1,会发生什么。
我创建了一个名为my_fun的函数,请参阅section function。在这个函数中,我取start_date并根据start_date和end_withs之间的差除以持续时间创建新的start_dates。这给出了需要创建的新日期。在创建split_start日期之后,split_end日期只是一个填充练习,请注意扣除1天以获得示例输出。
将map2
与setNames
和bind_rows
一起使用会创建一个data.frame,其中包含新日期和order_id作为标识符。然后可以将其与原始表连接以创建预期输出。
添加了代码来处理日期中的NA
等边缘情况或者start_date等于ends_with。
library(lubridate)
library(dplyr)
library(purrr)
df_actual %>%
inner_join(map2(df_actual$start_date, df_actual$ends_with, my_fun) %>%
setNames(., df_actual$order_id) %>%
bind_rows(., .id = "order_id"))
order_id start_date ends_with split_start split_end
1 a 2017-05-01 <NA> <NA> <NA>
2 b 2016-05-01 2016-07-06 2016-05-01 2016-05-30
3 b 2016-05-01 2016-07-06 2016-05-31 2016-06-29
4 b 2016-05-01 2016-07-06 2016-06-30 2016-07-06
5 c 2017-07-01 2017-07-01 2017-07-01 2017-07-01
6 d <NA> 2017-07-01 <NA> <NA>
功能:
my_fun <- function(x, y, duration = 30) {
d <- ddays(duration)
if(is.na(x) | is.na(y)) split_start <- split_end <- NA else
if(x == y) {
split_start <- x
split_end <- y
} else {
n <- (y - x) %/% d
split_start <- c(rep(x, n + 1))
for(i in 1:n+1){
split_start[i] <- x + (i - 1) * d
}
split_end <- split_start[1:n+1]
split_end <- split_end - ddays(1)
split_end[n+1] <- y
}
df <- data.frame(split_start, split_end)
return(df)
}
数据:
df_actual <- structure(list(order_id = c("a", "b", "c", "d"), start_date = structure(c(17287,
16922, 17348, NA), class = "Date"), ends_with = structure(c(NA,
16988, 17348, 17348), class = "Date")), class = "data.frame", row.names = c(NA,
-4L))
答案 1 :(得分:0)
library(lubridate)
df_actual<-data.frame(order_id="a",start_date="01/05/2017",ends_with="06/07/2017")
start_date <- dmy(as.character(df_actual[1,2]))
ends_with <- dmy(as.character(df_actual[1,3]))
day_number <- as.numeric(difftime(ends_with, start_date))
batch_duration = 30
no_batches <- ceiling(day_number / batch_duration)
remainder <- day_number %% batch_duration + 1#for last day
batches <- rep(batch_duration, no_batches)
if(remainder > 0){
batches[no_batches] <- remainder
}
new_cols <- data.frame(split_start_date=character(), split_end_date=character())
prev_end_date <- start_date - 1
for(i in 1:no_batches){
split_end_date <- start_date + (i-1) * batch_duration + (batches[i]-1)
split_start_date <- prev_end_date + 1
prev_end_date <- split_end_date
data_frame_row <- data.frame(split_start_date, split_end_date)
new_cols <- rbind(new_cols, data_frame_row)
}
# print(new_cols)
print(cbind(df_actual, new_cols))