日期间隔删除

时间:2016-06-09 13:38:49

标签: r date

我的数据看起来像

ID  CLM_ID  Date1   Date2
1   718182  1/1/2014    1/17/2014
1   718184  1/2/2014    1/8/2014
1   885236  1/15/2014   1/17/2014
1   885362  3/20/2014   3/21/2014
2   589963  3/18/2015   3/22/2015
2   589999  2/27/2015   5/9/2015
2   594226  4/11/2015   4/17/2015
2   689959  5/10/2015   6/10/2015
3   656696  5/1/2016    5/5/2016
3   669625  5/6/2016    5/22/2016
4   777777  2/21/2015   3/4/2015
4   778952  2/1/2015    2/28/2015
4   778965  3/1/2015    3/22/2015

我正在研究两个不同的问题。第一篇文章在前一篇文章中回答了关于如何滚动日期(Date roll-up in R),第二篇文章是我有间隔时间间隔,我试图摆脱它们。所以最终的结果应该是

ID  CLM_ID  Date1   Date2
1   718182  1/1/2014    1/17/2014
1   885362  3/20/2014   3/21/2014
2   589999  2/27/2015   5/9/2015
3   656696  5/1/2016    5/22/2016
4   778952  2/1/2015    3/22/2015

现在我知道我必须首先通过日期汇总来创建扩展区间,但是我如何摆脱这些子区间(一个术语我在间隔内组成间隔)?我也在寻找一种有效的解决方案,因为我实际上有75,000条记录可以通过(即我试图避免迭代解决方案)。

3 个答案:

答案 0 :(得分:2)

&#34;我也在寻找高效的解决方案......(即我试图避免迭代解决方案)。&#34; < / p>

  

&#34;你的假设是你世界的窗户。每隔一段时间擦掉一次,否则光线不会进来。&#34; - 艾萨克·阿西莫夫

以下是超快速base R迭代解决方案。它几乎可以立即返回非常大的数据帧的正确结果。 (它还&#34;汇总&#34;数据,因此不需要执行两种算法):

MakeDFSubInt <- function(df, includeCost = FALSE) {
    ## Sorting the data frame to allow for fast
    ## creation of the "Contained" logical vector below
    tempDF <- df[order(df$ID, df$Date1, df$Date2), ] 
    UniIDs <- unique(tempDF$ID)
    Len <- length(UniIDs)

    ## Determine starting (i.e. "s") and ending (i.e. "e")
    ## points of the respective groups of IDs
    e <- which(diff(tempDF$ID)==1)
    s <- c(1L, e + 1L)
    dfLen <- nrow(tempDF)
    e <- c(e, dfLen)

    ## Converting dates to integers so that comparison
    ## will be faster. Internally dates are stored as
    ## integers, so this isn't a problem
    dte1 <- as.integer(tempDF$Date1)
    dte2 <- as.integer(tempDF$Date2)

    ## Building logical vector in order to quickly create sub-intervals
    Contained <- rep(FALSE, dfLen)

    BegTime <- Sys.time()  ## Included to measure time of for loop execution

    for (j in 1:Len) {
        Compare <- ifelse(dte2[s[j]] >= (dte1[s[j]+1L]+1L), max(dte2[s[j]], dte2[s[j]+1L]), dte2[s[j]+1L])
        for (x in (s[j]+1L):e[j]) {
            if (!Contained[x-1L]) {
                Contained[x] <- dte2[x-1L] >= (dte1[x]-1L)
            } else {
                Contained[x] <- Compare >= (dte1[x]-1L)
            }

            ## could use ifelse, but this construct is faster
            if (Contained[x]) {  
                Compare <- max(Compare, dte2[x])
            } else {
                Compare <- dte2[x]
            }
        }
    }

    EndTime <- Sys.time()
    TotTime <- EndTime - BegTime
    if (printTime) {print(paste(c("for loop execution time was: ", format(TotTime)), collapse = ""))}

    ## identify sub-intervals
    nGrps <- which(!Contained)

    ## Create New fields for our new DF
    ID <- tempDF$ID[nGrps]
    CLM_ID <- tempDF$CLM_ID[nGrps]
    Date1 <- tempDF$Date1[nGrps]
    nGrps <- c(nGrps, dfLen+1L)

    ## as.Date is converting numbers to dates. 
    ## N.B. This only works if origin is supplied
    Date2 <- as.Date(vapply(1L:(length(nGrps) - 1L), function(x) {
                     max(dte2[nGrps[x]:(nGrps[x+1L]-1L)])}, 1L), origin = "1970-01-01")

    ## in a related question the OP had, "Cost" was
    ## included to show how the algorithm would handle
    ## generic summary information
    if (includeCost) {
        myCost <- tempDF$Cost
        Cost <-  vapply(1L:(length(nGrps) - 1L), function(x) sum(myCost[nGrps[x]:(nGrps[x+1L]-1L)]), 100.01)
        NewDf <- data.frame(ID,CLM_ID,Date1,Date2,Cost)
    } else {
        NewDf <- data.frame(ID,CLM_ID,Date1,Date2)
    }

    NewDf
}

对于问题中给出的例子,我们有:

ID <- c(rep(1,4),rep(2,4),rep(3,2),rep(4,3))
CLM_ID <- c(718182, 718184, 885236, 885362, 589963, 589999, 594226, 689959, 656696, 669625, 777777, 778952, 778965)
Date1 <- c("1/1/2014","1/2/2014","1/15/2014","3/20/2014","3/18/2015","2/27/2015","4/11/2015","5/10/2015","5/1/2016","5/6/2016","2/21/2015","2/1/2015","3/1/2015")
Date2 <- c("1/17/2014","1/8/2014","1/17/2014","3/21/2014","3/22/2015","5/9/2015","4/17/2015","6/10/2015","5/5/2016","5/22/2016","3/4/2015","2/28/2015","3/22/2015")
myDF <- data.frame(ID, CLM_ID, Date1, Date2)
myDF$Date1 <- as.Date(myDF$Date1, format = "%m/%d/%Y")
myDF$Date2 <- as.Date(myDF$Date2, format = "%m/%d/%Y")

MakeDFSubInt(myDF)
ID CLM_ID      Date1      Date2
1  1 718182 2014-01-01 2014-01-17
2  1 885362 2014-03-20 2014-03-21
3  2 589999 2015-02-27 2015-06-10
4  3 656696 2016-05-01 2016-05-22
5  4 778952 2015-02-01 2015-03-22

从发布的OP similar question开始,我们可以添加Cost字段,以显示我们如何继续进行此设置的计算。

set.seed(7777)
myDF$Cost <- round(rnorm(13, 450, sd = 100),2)

MakeDFSubInt(myDF,  includeCost = TRUE)
ID   CLM_ID      Date1      Date2    Cost
1  1 718182 2014-01-01 2014-01-17 1164.66
2  1 885362 2014-03-20 2014-03-21  568.16
3  2 589999 2015-02-27 2015-06-10 2019.16
4  3 656696 2016-05-01 2016-05-22  990.14
5  4 778952 2015-02-01 2015-03-22 1578.68

此算法可以很好地扩展。对于OP正在寻找的大小的数据帧,几乎立即返回请求的DF返回,对于非常大的数据帧,它仅在几秒钟内返回。

首先,我们构建一个函数,生成一个n行的随机数据框。

MakeRandomDF <- function(n) {
    set.seed(109)

    CLM_Size <- ifelse(n < 10^6, 10^6, 10^(ceiling(log10(n))))
    numYears <- trunc((6/425000)*n + 5)
    StrtYear <- ifelse(numYears > 16, 2000, 2016 - numYears)
    numYears <- ifelse(numYears > 16, 16, numYears)

    IDs <- sort(sample(trunc(n/100), n, replace = TRUE))
    CLM_IDs <- sample(CLM_Size, n)
    StrtDate <- as.Date(paste(c(as.character(StrtYear),"-01-01"), collapse = ""))
    myPossibleDates <- StrtDate+(0:(numYears*365))  ## "numYears" years of data
    Date1 <- sample(myPossibleDates, n, replace = TRUE)
    Date2 <- Date1 + sample(1:100, n, replace = TRUE)
    Cost <- round(rnorm(n, 850, 100), 2)

    tempDF <- data.frame(IDs,CLM_IDs,Date1,Date2,Cost)
    tempDF$Date1 <- as.Date(tempDF$Date1, format = "%m/%d/%Y")
    tempDF$Date2 <- as.Date(tempDF$Date2, format = "%m/%d/%Y")

    tempDF
}

适用于中等规模的DF(即75,000行)

TestDF <- MakeRandomDF(75000)
system.time(test1 <- MakeDFSubInt(TestDF, includeCost = TRUE, printTime = TRUE))
[1] "for loop execution time was: 0.06500006 secs"
  user  system elapsed 
  0.14    0.00    0.14 

nrow(test1)
[1] 7618

head(test1)
  ID CLM_ID      Date1      Date2     Cost
1  1 116944 2010-01-29 2010-01-30   799.90  ## The range of dates for 
2  1 515993 2010-02-15 2011-10-12 20836.83  ## each row are disjoint
3  1 408037 2011-12-13 2013-07-21 28149.26  ## as requested by the OP
4  1  20591 2013-07-25 2014-03-11 10449.51
5  1 338609 2014-04-24 2014-07-31  4219.48
6  1 628983 2014-08-03 2014-09-11  2170.93


对于非常大的DF(即> 500,000行)

TestDF2 <- MakeRandomDF(500000)
system.time(test2 <- MakeDFSubInt(TestDF2, includeCost = TRUE, printTime = TRUE))
[1] "for loop execution time was: 0.3679998 secs"
  user  system elapsed 
  1.19    0.03    1.21 

nrow(test2)
[1] 154839

head(test2)
  ID CLM_ID      Date1      Date2    Cost
1  1  71251 2004-04-19 2004-06-29 2715.69  ## The range of dates for 
2  1 601676 2004-07-05 2004-09-23 2675.04  ## each row are disjoint
3  1 794409 2004-12-28 2005-04-05 1760.63  ## as requested by the OP
4  1 424671 2005-06-03 2005-08-20 1973.67
5  1 390353 2005-09-16 2005-11-06  785.81
6  1 496611 2005-11-21 2005-11-24  904.09

system.time(test3 <- MakeDFSubInt(TestDF3, includeCost = TRUE, printTime = TRUE))
[1] "for loop execution time was: 0.6930001 secs"
  user  system elapsed 
  2.68    0.08    2.79      ## 1 million rows in under 3 seconds!!!

nrow(test3)
[1] 413668


解释

算法的主要部分是生成Contained逻辑向量,用于确定连续日期的子区间。此向量的生成依赖于以下事实:数据框首先按ID排序,然后按Date1排序,最后按Date2排序。我们首先找到每组ID的起始行和结束行。例如,通过OP提供的示例,我们有:

myDF
   ID CLM_ID      Date1      Date2
1   1 718182 2014-01-01 2014-01-17    ## <- 1   s[1]
2   1 718184 2014-01-02 2014-01-08
3   1 885236 2014-01-15 2014-01-17
4   1 885362 2014-03-20 2014-03-21    ## <- 4   e[1]
5   2 589963 2015-03-18 2015-03-22    ## <- 5   s[2]
6   2 589999 2015-02-27 2015-05-09
7   2 594226 2015-04-11 2015-04-17
8   2 689959 2015-05-10 2015-06-10    ## <- 8   e[2]
9   3 656696 2016-05-01 2016-05-05    ## <- 9   s[3]
10  3 669625 2016-05-06 2016-05-22    ## <- 10  e[3]
11  4 777777 2015-02-21 2015-03-04    ## <- 11  s[4]
12  4 778952 2015-02-01 2015-02-28
13  4 778965 2015-03-01 2015-03-22    ## <- 13  e[4]

以下是生成se的代码。

## Determine starting (i.e. "s") and ending (i.e. "e")
## points of the respective groups of IDs
e <- which(diff(tempDF$ID)==1)
s <- c(1L, e + 1L)
dfLen <- nrow(tempDF)
e <- c(e, dfLen)

s
1  5  9   11

e
4  8  10  13

现在,我们遍历每个组并开始填充逻辑向量Contained。如果特定行的日期范围与其上方的日期范围重叠(或是其延续),我们会将Contained的特定索引设置为TRUE。这就是为什么每个组中的第一行设置为FALSE,因为上面没有任何内容可以将它与之进行比较。正如我们这样做的那样,我们正在更新与前进相比较的最大日期,因此Compare变量。应该注意的是Date2[n] < Date2[n+1L]并不一定正确,这就是Compare <- max(Compare, dte2[x])继承TRUEs的原因。我们的例子的结果如下所示。

   ID CLM_ID      Date1      Date2 Contained
1   1 718182 2014-01-01 2014-01-17     FALSE
2   1 718184 2014-01-02 2014-01-08      TRUE  ## These two rows are contained
3   1 885236 2014-01-15 2014-01-17      TRUE  ## in the date range 1/1 - 1/17
4   1 885362 2014-03-20 2014-03-21     FALSE  ## This row isn't
6   2 589999 2015-02-27 2015-05-09     FALSE
5   2 589963 2015-03-18 2015-03-22      TRUE
7   2 594226 2015-04-11 2015-04-17      TRUE
8   2 689959 2015-05-10 2015-06-10      TRUE  ## N.B. 5/10 is a continuance of 5/09
9   3 656696 2016-05-01 2016-05-05     FALSE
10  3 669625 2016-05-06 2016-05-22      TRUE
12  4 778952 2015-02-01 2015-02-28     FALSE
11  4 777777 2015-02-21 2015-03-04      TRUE
13  4 778965 2015-03-01 2015-03-22      TRUE

现在我们可以轻松识别&#34;开始&#34;行通过标识具有相应FALSE的所有行。在此之后,只需计算您感兴趣的任何内容(例如max(Date2)sum(Cost))就可以轻松找到摘要信息{/ 1}}和Voila !!

答案 1 :(得分:2)

使用current development version of data.table, v1.9.7

中的non-equi个联接
require(data.table) # v1.9.7+
dt[dt, .(CLM_IDs = CLM_IDs[.N==1L]), on=.(ID, Date1<=Date1, Date2>=Date2), by=.EACHI]
#    ID      Date1      Date2 CLM_ID
# 1:  1 2014-01-01 2014-01-17 718182
# 2:  1 2014-03-20 2014-03-21 885362
# 3:  2 2015-02-27 2015-05-09 589999
# 4:  2 2015-05-10 2015-06-10 689959
# 5:  3 2016-05-01 2016-05-05 656696
# 6:  3 2016-05-06 2016-05-22 669625
# 7:  4 2015-02-21 2015-03-04 777777
# 8:  4 2015-02-01 2015-02-28 778952
# 9:  4 2015-03-01 2015-03-22 778965

这样做,对于dt(方括号内的一行)中的每一行,它会根据提供给dt(在外部)的条件查找哪些行匹配。 on论证。

如果唯一匹配是自匹配,则返回匹配的行索引(因为条件也包括相等)。这是由CLM_IDs[.N == 1L]完成的,其中.N包含每个组的观察次数。

答案 2 :(得分:1)

这是一个不太漂亮的解决方案,将每一行与所有其他行的日期进行比较。我更正了一年3015到2015年。但结果与您的预期不同。要么我误解了你的问题,要么你误读了数据。

数据:

dta$Date1 <- as.Date(dta$Date1, format = "%m/%d/%Y")
dta$Date2 <- as.Date(dta$Date2, format = "%m/%d/%Y")

# Boolean vector to memorize results
keep <- logical(length = nrow(dta))
for(i in 1:nrow(dta)) {
  match <- dta[dta$Date1 <= dta$Date1[i] & dta$Date2 >= dta$Date2[i], ]
  if(nrow(match) == 1) keep[i] <- TRUE
}

# Result
dta[keep, ]

代码:

module FirstArgumentIsAString
  module Initializer
    def initialize(word)
      fail 'Word must be String' unless word.is_a?(String)
      super
    end
  end

  def self.included(klass)
    klass.send :prepend, Initializer
  end
end

class Foo
  include FirstArgumentIsAString
end

y = Foo.new(2)
> Uncaught exception: Word must be String