我的数据看起来像
ID CLM_ID Date1 Date2
1 718182 1/1/2014 1/17/2014
1 718184 1/2/2014 1/8/2014
1 885236 1/15/2014 1/17/2014
1 885362 3/20/2014 3/21/2014
2 589963 3/18/2015 3/22/2015
2 589999 2/27/2015 5/9/2015
2 594226 4/11/2015 4/17/2015
2 689959 5/10/2015 6/10/2015
3 656696 5/1/2016 5/5/2016
3 669625 5/6/2016 5/22/2016
4 777777 2/21/2015 3/4/2015
4 778952 2/1/2015 2/28/2015
4 778965 3/1/2015 3/22/2015
我正在研究两个不同的问题。第一篇文章在前一篇文章中回答了关于如何滚动日期(Date roll-up in R),第二篇文章是我有间隔时间间隔,我试图摆脱它们。所以最终的结果应该是
ID CLM_ID Date1 Date2
1 718182 1/1/2014 1/17/2014
1 885362 3/20/2014 3/21/2014
2 589999 2/27/2015 5/9/2015
3 656696 5/1/2016 5/22/2016
4 778952 2/1/2015 3/22/2015
现在我知道我必须首先通过日期汇总来创建扩展区间,但是我如何摆脱这些子区间(一个术语我在间隔内组成间隔)?我也在寻找一种有效的解决方案,因为我实际上有75,000条记录可以通过(即我试图避免迭代解决方案)。
答案 0 :(得分:2)
&#34;我也在寻找高效的解决方案......(即我试图避免迭代解决方案)。&#34; < / p>
&#34;你的假设是你世界的窗户。每隔一段时间擦掉一次,否则光线不会进来。&#34; - 艾萨克·阿西莫夫
以下是超快速base R
迭代解决方案。它几乎可以立即返回非常大的数据帧的正确结果。 (它还&#34;汇总&#34;数据,因此不需要执行两种算法):
MakeDFSubInt <- function(df, includeCost = FALSE) {
## Sorting the data frame to allow for fast
## creation of the "Contained" logical vector below
tempDF <- df[order(df$ID, df$Date1, df$Date2), ]
UniIDs <- unique(tempDF$ID)
Len <- length(UniIDs)
## Determine starting (i.e. "s") and ending (i.e. "e")
## points of the respective groups of IDs
e <- which(diff(tempDF$ID)==1)
s <- c(1L, e + 1L)
dfLen <- nrow(tempDF)
e <- c(e, dfLen)
## Converting dates to integers so that comparison
## will be faster. Internally dates are stored as
## integers, so this isn't a problem
dte1 <- as.integer(tempDF$Date1)
dte2 <- as.integer(tempDF$Date2)
## Building logical vector in order to quickly create sub-intervals
Contained <- rep(FALSE, dfLen)
BegTime <- Sys.time() ## Included to measure time of for loop execution
for (j in 1:Len) {
Compare <- ifelse(dte2[s[j]] >= (dte1[s[j]+1L]+1L), max(dte2[s[j]], dte2[s[j]+1L]), dte2[s[j]+1L])
for (x in (s[j]+1L):e[j]) {
if (!Contained[x-1L]) {
Contained[x] <- dte2[x-1L] >= (dte1[x]-1L)
} else {
Contained[x] <- Compare >= (dte1[x]-1L)
}
## could use ifelse, but this construct is faster
if (Contained[x]) {
Compare <- max(Compare, dte2[x])
} else {
Compare <- dte2[x]
}
}
}
EndTime <- Sys.time()
TotTime <- EndTime - BegTime
if (printTime) {print(paste(c("for loop execution time was: ", format(TotTime)), collapse = ""))}
## identify sub-intervals
nGrps <- which(!Contained)
## Create New fields for our new DF
ID <- tempDF$ID[nGrps]
CLM_ID <- tempDF$CLM_ID[nGrps]
Date1 <- tempDF$Date1[nGrps]
nGrps <- c(nGrps, dfLen+1L)
## as.Date is converting numbers to dates.
## N.B. This only works if origin is supplied
Date2 <- as.Date(vapply(1L:(length(nGrps) - 1L), function(x) {
max(dte2[nGrps[x]:(nGrps[x+1L]-1L)])}, 1L), origin = "1970-01-01")
## in a related question the OP had, "Cost" was
## included to show how the algorithm would handle
## generic summary information
if (includeCost) {
myCost <- tempDF$Cost
Cost <- vapply(1L:(length(nGrps) - 1L), function(x) sum(myCost[nGrps[x]:(nGrps[x+1L]-1L)]), 100.01)
NewDf <- data.frame(ID,CLM_ID,Date1,Date2,Cost)
} else {
NewDf <- data.frame(ID,CLM_ID,Date1,Date2)
}
NewDf
}
对于问题中给出的例子,我们有:
ID <- c(rep(1,4),rep(2,4),rep(3,2),rep(4,3))
CLM_ID <- c(718182, 718184, 885236, 885362, 589963, 589999, 594226, 689959, 656696, 669625, 777777, 778952, 778965)
Date1 <- c("1/1/2014","1/2/2014","1/15/2014","3/20/2014","3/18/2015","2/27/2015","4/11/2015","5/10/2015","5/1/2016","5/6/2016","2/21/2015","2/1/2015","3/1/2015")
Date2 <- c("1/17/2014","1/8/2014","1/17/2014","3/21/2014","3/22/2015","5/9/2015","4/17/2015","6/10/2015","5/5/2016","5/22/2016","3/4/2015","2/28/2015","3/22/2015")
myDF <- data.frame(ID, CLM_ID, Date1, Date2)
myDF$Date1 <- as.Date(myDF$Date1, format = "%m/%d/%Y")
myDF$Date2 <- as.Date(myDF$Date2, format = "%m/%d/%Y")
MakeDFSubInt(myDF)
ID CLM_ID Date1 Date2
1 1 718182 2014-01-01 2014-01-17
2 1 885362 2014-03-20 2014-03-21
3 2 589999 2015-02-27 2015-06-10
4 3 656696 2016-05-01 2016-05-22
5 4 778952 2015-02-01 2015-03-22
从发布的OP similar question开始,我们可以添加Cost
字段,以显示我们如何继续进行此设置的计算。
set.seed(7777)
myDF$Cost <- round(rnorm(13, 450, sd = 100),2)
MakeDFSubInt(myDF, includeCost = TRUE)
ID CLM_ID Date1 Date2 Cost
1 1 718182 2014-01-01 2014-01-17 1164.66
2 1 885362 2014-03-20 2014-03-21 568.16
3 2 589999 2015-02-27 2015-06-10 2019.16
4 3 656696 2016-05-01 2016-05-22 990.14
5 4 778952 2015-02-01 2015-03-22 1578.68
此算法可以很好地扩展。对于OP正在寻找的大小的数据帧,几乎立即返回请求的DF返回,对于非常大的数据帧,它仅在几秒钟内返回。
首先,我们构建一个函数,生成一个n
行的随机数据框。
MakeRandomDF <- function(n) {
set.seed(109)
CLM_Size <- ifelse(n < 10^6, 10^6, 10^(ceiling(log10(n))))
numYears <- trunc((6/425000)*n + 5)
StrtYear <- ifelse(numYears > 16, 2000, 2016 - numYears)
numYears <- ifelse(numYears > 16, 16, numYears)
IDs <- sort(sample(trunc(n/100), n, replace = TRUE))
CLM_IDs <- sample(CLM_Size, n)
StrtDate <- as.Date(paste(c(as.character(StrtYear),"-01-01"), collapse = ""))
myPossibleDates <- StrtDate+(0:(numYears*365)) ## "numYears" years of data
Date1 <- sample(myPossibleDates, n, replace = TRUE)
Date2 <- Date1 + sample(1:100, n, replace = TRUE)
Cost <- round(rnorm(n, 850, 100), 2)
tempDF <- data.frame(IDs,CLM_IDs,Date1,Date2,Cost)
tempDF$Date1 <- as.Date(tempDF$Date1, format = "%m/%d/%Y")
tempDF$Date2 <- as.Date(tempDF$Date2, format = "%m/%d/%Y")
tempDF
}
适用于中等规模的DF(即75,000行)
TestDF <- MakeRandomDF(75000)
system.time(test1 <- MakeDFSubInt(TestDF, includeCost = TRUE, printTime = TRUE))
[1] "for loop execution time was: 0.06500006 secs"
user system elapsed
0.14 0.00 0.14
nrow(test1)
[1] 7618
head(test1)
ID CLM_ID Date1 Date2 Cost
1 1 116944 2010-01-29 2010-01-30 799.90 ## The range of dates for
2 1 515993 2010-02-15 2011-10-12 20836.83 ## each row are disjoint
3 1 408037 2011-12-13 2013-07-21 28149.26 ## as requested by the OP
4 1 20591 2013-07-25 2014-03-11 10449.51
5 1 338609 2014-04-24 2014-07-31 4219.48
6 1 628983 2014-08-03 2014-09-11 2170.93
对于非常大的DF(即> 500,000行)
TestDF2 <- MakeRandomDF(500000)
system.time(test2 <- MakeDFSubInt(TestDF2, includeCost = TRUE, printTime = TRUE))
[1] "for loop execution time was: 0.3679998 secs"
user system elapsed
1.19 0.03 1.21
nrow(test2)
[1] 154839
head(test2)
ID CLM_ID Date1 Date2 Cost
1 1 71251 2004-04-19 2004-06-29 2715.69 ## The range of dates for
2 1 601676 2004-07-05 2004-09-23 2675.04 ## each row are disjoint
3 1 794409 2004-12-28 2005-04-05 1760.63 ## as requested by the OP
4 1 424671 2005-06-03 2005-08-20 1973.67
5 1 390353 2005-09-16 2005-11-06 785.81
6 1 496611 2005-11-21 2005-11-24 904.09
system.time(test3 <- MakeDFSubInt(TestDF3, includeCost = TRUE, printTime = TRUE))
[1] "for loop execution time was: 0.6930001 secs"
user system elapsed
2.68 0.08 2.79 ## 1 million rows in under 3 seconds!!!
nrow(test3)
[1] 413668
算法的主要部分是生成Contained
逻辑向量,用于确定连续日期的子区间。此向量的生成依赖于以下事实:数据框首先按ID
排序,然后按Date1
排序,最后按Date2
排序。我们首先找到每组ID的起始行和结束行。例如,通过OP提供的示例,我们有:
myDF
ID CLM_ID Date1 Date2
1 1 718182 2014-01-01 2014-01-17 ## <- 1 s[1]
2 1 718184 2014-01-02 2014-01-08
3 1 885236 2014-01-15 2014-01-17
4 1 885362 2014-03-20 2014-03-21 ## <- 4 e[1]
5 2 589963 2015-03-18 2015-03-22 ## <- 5 s[2]
6 2 589999 2015-02-27 2015-05-09
7 2 594226 2015-04-11 2015-04-17
8 2 689959 2015-05-10 2015-06-10 ## <- 8 e[2]
9 3 656696 2016-05-01 2016-05-05 ## <- 9 s[3]
10 3 669625 2016-05-06 2016-05-22 ## <- 10 e[3]
11 4 777777 2015-02-21 2015-03-04 ## <- 11 s[4]
12 4 778952 2015-02-01 2015-02-28
13 4 778965 2015-03-01 2015-03-22 ## <- 13 e[4]
以下是生成s
和e
的代码。
## Determine starting (i.e. "s") and ending (i.e. "e")
## points of the respective groups of IDs
e <- which(diff(tempDF$ID)==1)
s <- c(1L, e + 1L)
dfLen <- nrow(tempDF)
e <- c(e, dfLen)
s
1 5 9 11
e
4 8 10 13
现在,我们遍历每个组并开始填充逻辑向量Contained
。如果特定行的日期范围与其上方的日期范围重叠(或是其延续),我们会将Contained
的特定索引设置为TRUE
。这就是为什么每个组中的第一行设置为FALSE
,因为上面没有任何内容可以将它与之进行比较。正如我们这样做的那样,我们正在更新与前进相比较的最大日期,因此Compare
变量。应该注意的是Date2[n] < Date2[n+1L]
并不一定正确,这就是Compare <- max(Compare, dte2[x])
继承TRUEs
的原因。我们的例子的结果如下所示。
ID CLM_ID Date1 Date2 Contained
1 1 718182 2014-01-01 2014-01-17 FALSE
2 1 718184 2014-01-02 2014-01-08 TRUE ## These two rows are contained
3 1 885236 2014-01-15 2014-01-17 TRUE ## in the date range 1/1 - 1/17
4 1 885362 2014-03-20 2014-03-21 FALSE ## This row isn't
6 2 589999 2015-02-27 2015-05-09 FALSE
5 2 589963 2015-03-18 2015-03-22 TRUE
7 2 594226 2015-04-11 2015-04-17 TRUE
8 2 689959 2015-05-10 2015-06-10 TRUE ## N.B. 5/10 is a continuance of 5/09
9 3 656696 2016-05-01 2016-05-05 FALSE
10 3 669625 2016-05-06 2016-05-22 TRUE
12 4 778952 2015-02-01 2015-02-28 FALSE
11 4 777777 2015-02-21 2015-03-04 TRUE
13 4 778965 2015-03-01 2015-03-22 TRUE
现在我们可以轻松识别&#34;开始&#34;行通过标识具有相应FALSE
的所有行。在此之后,只需计算您感兴趣的任何内容(例如max(Date2)
,sum(Cost)
)就可以轻松找到摘要信息{/ 1}}和Voila !!
答案 1 :(得分:2)
使用current development version of data.table, v1.9.7,
中的non-equi
个联接
require(data.table) # v1.9.7+
dt[dt, .(CLM_IDs = CLM_IDs[.N==1L]), on=.(ID, Date1<=Date1, Date2>=Date2), by=.EACHI]
# ID Date1 Date2 CLM_ID
# 1: 1 2014-01-01 2014-01-17 718182
# 2: 1 2014-03-20 2014-03-21 885362
# 3: 2 2015-02-27 2015-05-09 589999
# 4: 2 2015-05-10 2015-06-10 689959
# 5: 3 2016-05-01 2016-05-05 656696
# 6: 3 2016-05-06 2016-05-22 669625
# 7: 4 2015-02-21 2015-03-04 777777
# 8: 4 2015-02-01 2015-02-28 778952
# 9: 4 2015-03-01 2015-03-22 778965
这样做,对于dt
(方括号内的一行)中的每一行,它会根据提供给dt
(在外部)的条件查找哪些行匹配。 on
论证。
如果唯一匹配是自匹配,则返回匹配的行索引(因为条件也包括相等)。这是由CLM_IDs[.N == 1L]
完成的,其中.N
包含每个组的观察次数。
答案 2 :(得分:1)
这是一个不太漂亮的解决方案,将每一行与所有其他行的日期进行比较。我更正了一年3015到2015年。但结果与您的预期不同。要么我误解了你的问题,要么你误读了数据。
数据:
dta$Date1 <- as.Date(dta$Date1, format = "%m/%d/%Y")
dta$Date2 <- as.Date(dta$Date2, format = "%m/%d/%Y")
# Boolean vector to memorize results
keep <- logical(length = nrow(dta))
for(i in 1:nrow(dta)) {
match <- dta[dta$Date1 <= dta$Date1[i] & dta$Date2 >= dta$Date2[i], ]
if(nrow(match) == 1) keep[i] <- TRUE
}
# Result
dta[keep, ]
代码:
module FirstArgumentIsAString
module Initializer
def initialize(word)
fail 'Word must be String' unless word.is_a?(String)
super
end
end
def self.included(klass)
klass.send :prepend, Initializer
end
end
class Foo
include FirstArgumentIsAString
end
y = Foo.new(2)
> Uncaught exception: Word must be String