我有一个名为 'DFrame' 的数据框,内容如下:
RecordNo | Cust_ID | Record_Date
1 | 023 | 2014-03-01
2 | 056 | 2014-01-18
3 | 041 | 2014-03-04
4 | 023 | 2014-03-21
5 | 056 | 2014-01-25
6 | 003 | 2014-03-01
7 | 023 | 2014-04-01
8 | 023 | 2014-04-02
我想添加一列,显示对每条记录而言,同一 Cust_ID 在该记录 Record_Date 之后的 14 天内又出现了多少条记录:
RecordNo | Cust_ID | Record_Date | 14-day_Repeat_Count
1 | 023 | 2014-03-01 | 0
2 | 056 | 2014-01-18 | 1
3 | 041 | 2014-03-04 | 0
4 | 023 | 2014-03-21 | 2
5 | 056 | 2014-01-25 | 0
6 | 003 | 2014-03-01 | 0
7 | 023 | 2014-04-01 | 1
8 | 023 | 2014-04-02 | 0
我正在尝试在R中编写快速代码来完成此任务。我发现了一些文章使得满足条件的计数记录看起来很容易,但它们通常只指向静态条件或与当前记录值无关的条件: http://one-line-it.blogspot.ca/2013/01/r-number-of-rows-matching-condition.html
我认为逻辑可能如下:
# Sort DFRAME by RECORD_DATE decreasing=FALSE
......
# Count records below the current record that have a matching Cust_ID
# AND whose Record_Date is within 14 days after the current Record_Date
# put result into DFrame$14-day_Repeat_Count
......
我在DAX中完成了这种逻辑:
=CALCULATE(
    // Count the rows that survive the filter below.
    COUNTA([Cust_ID]),
    FILTER(
        DFrame,
        // Same customer as the current row ...
        [Cust_ID] = EARLIER([Cust_ID]) &&
        // ... strictly later than the current row ...
        [Record_Date] > EARLIER([Record_Date]) &&
        // ... and no more than 14 days later.
        [Record_Date] <= EARLIER([Record_Date]) + 14
    )
)
(速度非常快,但属于微软专有技术),也在 Excel 中用 'CountIfs' 实现过(容易实现,但非常非常慢,而且同样被绑定在微软的产品上)。有没有人可以告诉我,在 R 中如何按这类条件进行计数?
答案 0 :(得分:3)
为了简化这一过程,我创建了一些更实际的样本数据来重新创建问题。
# Load data.table first so the table can be built once the vectors exist.
library(data.table)

# Sample records: twelve per customer, with dates written as day offsets from
# 2010-01-01 (adding a number to a Date shifts it by that many days).
Record_Date <- as.Date("2010-01-01") + c(
  31, 33, 38, 41, 44, 59, 68, 69, 75, 78, 85, 88,
  32, 34, 45, 46, 51, 54, 60, 65, 67, 70, 74, 80,
  33, 35, 42, 45, 50, 60, 65, 70, 75, 80, 82, 85
)
Cust_ID <- rep(c(1, 2, 3), each = 12)

# One row per record, with columns Cust_ID and Record_Date.
data <- data.table(Cust_ID = Cust_ID, Record_Date = Record_Date)
Cust_ID Record_Date
1: 1 2010-02-01
2: 1 2010-02-03
3: 1 2010-02-08
4: 1 2010-02-11
5: 1 2010-02-14
6: 1 2010-03-01
7: 1 2010-03-10
8: 1 2010-03-11
9: 1 2010-03-17
10: 1 2010-03-20
11: 1 2010-03-27
12: 1 2010-03-30
13: 2 2010-02-02
14: 2 2010-02-04
15: 2 2010-02-15
16: 2 2010-02-16
17: 2 2010-02-21
我不确定R是否有一种非常有效的方式来做你想做的事情,但我认为使用data.table包可能是一个不错的选择。
# For every (Cust_ID, Record_Date) pair, count how many later records of the
# same customer fall within 14 days. The chain has three steps:
#   1. pair each (Cust_ID, Record_Date) group with every row of `data`;
#   2. keep pairs for the same customer that are not earlier than the record,
#      flagging those at most 14 days later;
#   3. sum the flags per group and subtract 1 for the record matching itself.
# NOTE(review): the "- 1" presumably assumes each (Cust_ID, Record_Date) pair
# occurs only once in `data` - confirm for data with duplicates.
output <- data[, as.list(data[, list(Cust_ID2 = Cust_ID, Compare_Date = Record_Date)]),
  by = c("Cust_ID", "Record_Date")
][
  Cust_ID == Cust_ID2 & Compare_Date >= Record_Date,
  list(
    Cust_ID,
    Record_Date,
    Compare_Date,
    Within14 = as.numeric(as.numeric(Compare_Date - Record_Date) <= 14)
  )
][
  ,
  list(Within14 = sum(Within14) - 1),
  by = c("Cust_ID", "Record_Date")
]
Cust_ID Record_Date Within14
1: 1 2010-02-01 4
2: 1 2010-02-03 3
3: 1 2010-02-08 2
4: 1 2010-02-11 1
5: 1 2010-02-14 0
6: 1 2010-03-01 2
7: 1 2010-03-10 3
8: 1 2010-03-11 2
9: 1 2010-03-17 3
10: 1 2010-03-20 2
11: 1 2010-03-27 1
12: 1 2010-03-30 0
13: 2 2010-02-02 3
14: 2 2010-02-04 2
15: 2 2010-02-15 3
16: 2 2010-02-16 3
17: 2 2010-02-21 3
对于您的特定数据,输出将如下所示:
# Rebuild `data` from the eight records in the question.
Cust_ID <- c("023", "056", "041", "023", "056", "003", "023", "023")
Record_Date <- as.Date(c(
  "2014-03-01", "2014-01-18", "2014-03-04", "2014-03-21",
  "2014-01-25", "2014-03-01", "2014-04-01", "2014-04-02"
))
data <- data.table(Cust_ID = Cust_ID, Record_Date = Record_Date)

# Pair every (Cust_ID, Record_Date) group with all rows, keep same-customer
# rows that are not earlier, flag those at most 14 days later, then sum the
# flags per group and subtract 1 for the record matching itself.
output <- data[, as.list(data[, list(Cust_ID2 = Cust_ID, Compare_Date = Record_Date)]),
  by = c("Cust_ID", "Record_Date")
][
  Cust_ID == Cust_ID2 & Compare_Date >= Record_Date,
  list(
    Cust_ID,
    Record_Date,
    Compare_Date,
    Within14 = as.numeric(as.numeric(Compare_Date - Record_Date) <= 14)
  )
][
  ,
  list(Within14 = sum(Within14) - 1),
  by = c("Cust_ID", "Record_Date")
]
output
Cust_ID Record_Date Within14
1: 023 2014-03-01 0
2: 056 2014-01-18 1
3: 041 2014-03-04 0
4: 023 2014-03-21 2
5: 056 2014-01-25 0
6: 003 2014-03-01 0
7: 023 2014-04-01 1
8: 023 2014-04-02 0
答案 1 :(得分:3)
我不认为你会比Rcpp快得多。
首先,对数据框进行排序。
# Read the sample records from the question (whitespace-separated, with a
# header row); character columns are kept as strings, not factors.
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "RecordNo Cust_ID Record_Date
1 023 2014-03-01
2 056 2014-01-18
3 041 2014-03-04
4 023 2014-03-21
5 056 2014-01-25
6 003 2014-03-01
7 023 2014-04-01
8 023 2014-04-02"
)

# Replace each date with the number of days since the earliest record.
df$Record_Date <- as.Date(df$Record_Date)
df$Record_Date <- as.numeric(
  difftime(df$Record_Date, min(df$Record_Date), units = "days")
)

# Sort ascending by day offset.
df <- df[order(df$Record_Date), ]
然后用Rcpp函数计算计数。
library(Rcpp)

# count_14(id, day): for each record i, count the records that come after it
# in the vectors, have the same id, and whose day is at most 14 greater.
#   id  - numeric customer ids, one per record
#   day - numeric day offsets, one per record; must be sorted ascending,
#         because the inner scan stops at the first record outside the window
# Returns a numeric vector of counts, aligned with the input order.
cppFunction('
NumericVector count_14(NumericVector id, NumericVector day) {
  const R_xlen_t n = id.size();
  NumericVector out(n);
  for (R_xlen_t i = 0; i < n; ++i) {
    // Test the index bound BEFORE reading day[j]: NumericVector indexing is
    // unchecked, so testing it afterwards reads past the end of the vector.
    for (R_xlen_t j = i + 1; j < n && day[j] - day[i] <= 14; ++j) {
      if (id[j] == id[i]) out[i] += 1;
    }
  }
  return out;
}')

# df must already be sorted by Record_Date; print in original record order.
df$count <- count_14(df$Cust_ID, df$Record_Date)
df[order(df$RecordNo), ]
# RecordNo Cust_ID Record_Date count
# 1 1 23 42 0
# 2 2 56 0 1
# 3 3 41 45 0
# 4 4 23 62 2
# 5 5 56 7 0
# 6 6 3 42 0
# 7 7 23 73 1
# 8 8 23 74 0
答案 2 :(得分:3)
一种可能更快、也更节省内存的做法如下:
library(data.table)

# Sample records: twelve per customer; dates are plain day numbers here.
RecordNo <- 1:36
Record_Date <- c(
  31, 33, 38, 41, 44, 59, 68, 69, 75, 78, 85, 88,
  32, 34, 45, 46, 51, 54, 60, 65, 67, 70, 74, 80,
  33, 35, 42, 45, 50, 60, 65, 70, 75, 80, 82, 85
)
Cust_ID <- rep(c(1, 2, 3), each = 12)

# Build the table and sort it by customer, then by date.
data <- data.table(Cust_ID = Cust_ID, Record_Date = Record_Date)
data <- data[order(Cust_ID, Record_Date)]

# Number the customers 1, 2, ... by group; used below to index into Ref.
data[, Cust_No := .GRP, by = "Cust_ID"]

# One row per customer, holding all of that customer's dates in a list cell.
Ref <- data[, list(Compare_Date = list(I(Record_Date))), by = "Cust_ID"]

# For each record, compare its date with every date of the same customer and
# count those that are strictly later but at most 14 days later.
system.time(
  data$Roll.Cnt <- mapply(
    FUN = function(RD, NUM) {
      gap <- as.numeric(Ref$Compare_Date[[NUM]] - RD)
      sum(gap > 0 & gap <= 14)
    },
    RD = data$Record_Date,
    NUM = data$Cust_No
  )
)
结果数据如下所示:
# Keep only the reporting columns, ordered by customer and then by date.
data <- data[order(Cust_ID, Record_Date), list(Cust_ID, Record_Date, Roll.Cnt)]
data
Cust_ID Record_Date Roll.Cnt
1: 1 31 4
2: 1 33 3
3: 1 38 2
4: 1 41 1
5: 1 44 0
6: 1 59 2
7: 1 68 3
8: 1 69 2
9: 1 75 3
10: 1 78 2
11: 1 85 1
12: 1 88 0
13: 2 32 3
14: 2 34 2
15: 2 45 3
答案 3 :(得分:2)
您可以尝试使用复杂度为 O(n * log n) 的 findInterval():
# Read the sample records from the question.
DF <- read.csv(
  text = '"RecordNo","Cust_ID","Record_Date"
1,"023","2014-03-01"
2,"056","2014-01-18"
3,"041","2014-03-04"
4,"023","2014-03-21"
5,"056","2014-01-25"
6,"003","2014-03-01"
7,"023","2014-04-01"
8,"023","2014-04-02"',
  stringsAsFactors = FALSE
)
DF$Record_Date <- as.POSIXct(DF$Record_Date, format = "%Y-%m-%d", tz = "GMT")

# Sort by ascending date; findInterval() requires a sorted `vec`.
DF <- DF[order(DF$Record_Date), ]

# For each date D, EndIdx is the index of the LAST row whose date is <= D + 14
# days (POSIXct arithmetic is in seconds, hence 14 * 24 * 60 * 60).
DF$EndIdx <- findInterval(
  x = DF$Record_Date + 14 * 24 * 60 * 60,
  vec = DF$Record_Date
)

# Count the rows from the current row through EndIdx with the same Cust_ID.
# The range starts at the current row, so Count INCLUDES the record itself;
# subtract 1 to get the number of repeats in the following 14 days.
# seq_len() handles an empty DF and vapply() pins the result type to integer.
DF$Count <- vapply(
  seq_len(nrow(DF)),
  function(i) sum(DF$Cust_ID[i:DF$EndIdx[i]] == DF$Cust_ID[i]),
  integer(1)
)
> DF
RecordNo Cust_ID Record_Date EndIdx Count
2 2 56 2014-01-18 2 2
5 5 56 2014-01-25 2 1
1 1 23 2014-03-01 5 1
6 6 3 2014-03-01 5 1
3 3 41 2014-03-04 5 1
4 4 23 2014-03-21 8 3
7 7 23 2014-04-01 8 2
8 8 23 2014-04-02 8 1
答案 4 :(得分:0)
这不是一种快速的方法,但可以作为一个起点。根据我的经验,R 中的滚动计算很难做得很快。如果找不到巧妙的解决方案,可能需要考虑使用 Rcpp。
# Read the sample records from the question.
df <- read.table(
  text = "RecordNo Cust_ID Record_Date
1 023 2014-03-01
2 056 2014-01-18
3 041 2014-03-04
4 023 2014-03-21
5 056 2014-01-25
6 003 2014-03-01
7 023 2014-04-01
8 023 2014-04-02",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Dates are awkward to compare directly, so convert them to the number of
# days after the earliest record.
df$Record_Date <- as.Date(df$Record_Date)
df$Record_Date <- as.numeric(df$Record_Date - min(df$Record_Date))

# Count, for each record, the records with the same id whose day is strictly
# later and at most `window` days later. The upper bound is inclusive, so a
# repeat exactly `window` days later is counted.
#   ids    - vector of customer ids, one per record
#   days   - numeric day offsets, one per record (any order)
#   window - length of the look-ahead window in days
# Returns an integer vector aligned with the inputs (empty for empty input).
# This is a simple O(n^2) scan: every record is compared against all others.
count_repeats <- function(ids, days, window = 14) {
  vapply(
    seq_along(days),
    function(i) {
      in_window <- days > days[i] & days <= days[i] + window
      sum(ids[in_window] == ids[i])
    },
    integer(1)
  )
}

df$count <- count_repeats(df$Cust_ID, df$Record_Date)
df
# RecordNo Cust_ID Record_Date count
# 1 1 23 42 0
# 2 2 56 0 1
# 3 3 41 45 0
# 4 4 23 62 2
# 5 5 56 7 0
# 6 6 3 42 0
# 7 7 23 73 1
# 8 8 23 74 0