Question

我试图计算一个人使用R住在无家可归者收容所的时间。无家可归者收容所有两种不同类型的办理登机手续，一次是隔夜登记，另一次是长期登记。我想对数据进行整形以获得EntryDate和ExitDate，以便每次入住都没有至少一天的休息时间。

以下是目前的数据：

PersonalID        EntryDate        ExitDate
    1             2016-12-01       2016-12-02
    1             2016-12-03       2016-12-04
    1             2016-12-16       2016-12-17
    1             2016-12-17       2016-12-18
    1             2016-12-18       2016-12-19
    2             2016-10-01       2016-10-20
    2             2016-10-21       2016-10-22
    3             2016-09-01       2016-09-02
    3             2016-09-20       2016-09-21

最终，我试图让上述日期代表连续范围来计算参与者的总停留时间。

例如，上述数据将变为：

PersonalID      EntryDate   ExitDate
1               2016-12-01  2016-12-04
1               2016-12-16  2016-12-19
2               2016-10-01  2016-10-22
3               2016-09-01  2016-09-02
3               2016-09-20  2016-09-21

Answer 1

这是一个丑陋的解决方案。可能会做一些更干净的事情......但它确实有效。此解决方案也应该使用真实数据进行调试（我已经在您的exaple中添加了一行以获得更多不同的情况）

d <- read.table(text = '
PersonalID        EntryDate        ExitDate
    1             2016-12-01       2016-12-02
    1             2016-12-03       2016-12-04
    1             2016-12-16       2016-12-17
    1             2016-12-17       2016-12-18
    1             2016-12-18       2016-12-19
    2             2016-10-01       2016-10-20
    2             2016-10-21       2016-10-22
    3             2016-09-01       2016-09-02
    3             2016-09-20       2016-09-21
    4             2016-09-20       2016-09-21
', header = TRUE)

#' transorm in Date format
d$EntryDate <- as.Date(as.character(d$EntryDate))
d$ExitDate <- as.Date(as.character(d$ExitDate))
summary(d)

#' Reorder to be sure that the ExitDate / Entry date are in chronological order
d <- d[order(d$PersonalID, d$EntryDate),]
#' Add a column that will store the number of days between one exit and the next entry
d$nbdays <- 9999

# Split to have a list with dataframe for each ID
d <- split(d, d$PersonalID)
d

for(i in 1:length(d)) {

    # Compute number of days between one exit and the next entry (only if there are 
    # more than one entry)
    if(nrow(d[[i]])>1) {
        d[[i]][-1,"nbdays"] <- d[[i]][2:nrow(d[[i]]),"EntryDate"] - 
            d[[i]][1:(nrow(d[[i]])-1),"ExitDate"]
    }

    x <- d[[i]] # store a copy of the data to lighten the syntax

    # Entry dates for which the previous exit is higher than 1 day (including the first one)
    entr <- x[x$nbdays>1,"EntryDate"]

    # Exit dates just before cases where nbdays are > 1 and includes the last exit date. 
    # We use unique to avoid picking 2 times the last exit
    whichexist <- unique(c(c(which(x$nbdays > 1)-1)[-1],nrow(x)))
    exit <- x[whichexist,"ExitDate"]

    d[[i]] <- data.frame(
        PersonalID = x[1,1], 
        EntryDate = entr,
        ExitDate = exit
    )
}

# paste the elements of this list into one data.frame
do.call(rbind, d)

Answer 2

这是使用dplyr的解决方案。

library(dplyr)

d = structure(list(PersonalID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 
3L), EntryDate = structure(c(17136, 17138, 17151, 17152, 17153, 
17075, 17095, 17045, 17064), class = "Date"), ExitDate = structure(c(17137, 
17139, 17152, 17153, 17154, 17094, 17096, 17046, 17065), class = "Date")), class = "data.frame", .Names = c("PersonalID", 
"EntryDate", "ExitDate"), row.names = c(NA, -9L))

首先创建一个临时数据框，以保存进入和退出日期之间的所有日期：

d2 = d %>%
    rowwise() %>%
    do(data.frame(PersonalID = .$PersonalID, Present = seq(.$EntryDate, .$ExitDate, by = 'day'))) %>%
    unique %>% ## remove double dates when exit and re-entry occur on the same day
    ungroup()

然后从https://stackoverflow.com/a/14868742/827766

中寻找所有连续日期

d2 %>%
    group_by(PersonalID) %>%
    mutate(delta = c(1, diff(as.Date(Present)))) %>%
    group_by(PersonalID, stay = cumsum(delta!=1)) %>%
    summarize(EntryDate = min(Present), ExitDate = max(Present)) %>%
    subset(select = -c(stay))

在相同DF中整形间隔日期和范围日期

2 个答案: