按日期生成排名,然后转移到第二个数据框

时间:2014-10-24 16:02:23

标签: r date

我想生成一个包含基于另一列中事件的索引的列,其中索引按日期排序。对于其他事件,将根据日期范围分配索引。索引应按单位ID分组。

数据由两个不同数据框的单位,事件和日期组成。一个数据框有一个名为"整个文件"的事件。当"整个文件"事件发生时,等级应该递增。然后,必须将排名转移到该单位的其他行以及该日期范围内。如果在第一个"整个文件"之前有事件。事件,这些事件的排名应为0。

在第一个数据框中,其中包含"整个文件"事件,如果单位编号改变,并且第一个事件不是"整个文件",则开始事件具有前一个单元编号的最后一个等级。

然后必须按单位/日期范围将等级转移到后续数据帧。这就是我被困的地方

希望这个例子让问题清楚。

生成样本数据

set.seed(13)

# Ref http://stackoverflow.com/questions/14720983/efficiently-generate-a-random-sample-of-times-and-dates-between-two-dates

rDates <- function(N, st="2014/01/01", et="2014/10/01") {
  st <- as.POSIXct(as.Date(st))
  et <- as.POSIXct(as.Date(et))
  dt <- as.numeric(difftime(et,st,unit="sec"))
  ev <- sort(runif(N, 0, dt))
  rt <- st + ev
}

nSamples_df1=100
nSamples_df2=75

df1<-data.frame(Event.Name=sample(c("Entire File",paste("Event ",letters[1:5])),nSamples_df1,replace=TRUE,prob=c(3,rep(1,5))),
                unit=sample(1:10,nSamples_df1,replace=TRUE),
                event_time = rDates(nSamples_df1))

df2<-data.frame(event=sample(c(paste("Event ",letters[6:10])),nSamples_df2,replace=TRUE),
                unit=sample(1:10,nSamples_df2,replace=TRUE),
                event_time = rDates(nSamples_df2))

第一步是订购数据,然后对&#34;整个文件&#34;进行排名。 df1中的事件。

# Put df1 in order
df1<-with(df1,head(df1[order(unit,event_time),],50))
# Extract and rank the "Entire File" events
entireFileEvents <- df1[df1$Event.Name=="Entire File",
                        c("Event.Name","unit","event_time")]
rankedEntireFileEvents <- transform(entireFileEvents, 
                                    fileEventIndex = ave(xtfrm(event_time), unit, 
                                                     FUN = function(x) rank(x, ties.method = "first")))

将数据传输到原始数据框。我不确定这是否正确,排名是否最终位于正确的位置?

df1$fileEventIndex=NA

# Original risky assignment
# df1[df1$Event.Name=="Entire File","fileEventIndex"] <- rankedEntireFileEvents$fileEventIndex
# I'm not sure how to use merge in this case
# df1b <- merge(df1, rankedEntireFileEvents, by=c("Event.Name","unit","event_time"), sort = FALSE)
# Assignment using match, thanks akrun
match_rows <- match(paste(df1$Event.Name, df1$unit, df1$event_time),
      paste(rankedEntireFileEvents$Event.Name, rankedEntireFileEvents$unit, rankedEntireFileEvents$event_time))
df1_match_rows = which(!is.na(match_rows))
refe_match_rows = match_rows[!is.na(match_rows)]
df1[df1_match_rows,"fileEventIndex"] <- rankedEntireFileEvents$fileEventIndex[refe_match_rows]

使用zoo

填写剩余的排名
library(zoo)
df1<-na.locf(df1, na.rm = FALSE)
df1$fileEventIndex[is.na(df1$fileEventIndex)]=0

现在,我不知道如何将filesEventIndex从rankingEntireFileEvents传输到第二个数据帧。对于单位,如果日期较晚,则fileEventIndexin df2应具有相同的值。

以下是df1的当前结果。排名不正确,因为单位在&#34;整个文件&#34;之前发生了变化。发生了,因此第2单元的第一个事件排名为4,并且应该排名为0。

> with(df1,head(df1[order(unit,event_time),],50))
     Event.Name unit          event_time fileEventIndex
6   Entire File    1 2014-01-09 01:43:24              1
12     Event  a    1 2014-01-23 10:25:59              1
26     Event  c    1 2014-02-26 16:51:07              1
28     Event  b    1 2014-03-04 05:39:57              1
47  Entire File    1 2014-05-05 02:19:16              2
67  Entire File    1 2014-07-01 18:52:56              3
76     Event  a    1 2014-07-21 03:42:14              3
82     Event  b    1 2014-08-07 16:33:33              3
87     Event  a    1 2014-08-22 01:04:39              3
89  Entire File    1 2014-08-30 15:42:21              4
94     Event  a    1 2014-09-07 13:46:25              4
8      Event  e    2 2014-01-12 23:49:24              4   <-- This should be 1
16  Entire File    2 2014-01-27 10:20:28              1
21  Entire File    2 2014-02-11 17:24:22              2
22     Event  c    2 2014-02-21 22:32:28              2

这是第二个df的方法,但结果不正确

df2$fileEventIndex=NA
units <- sort(unique(rankedEntireFileEvents$unit))

for (iu in seq(1,length(units))) {
  uu = units[iu]
  rankSameUnit = rankedEntireFileEvents$unit==uu
  dfSameUnit = df2$unit == uu
  uDates <- rankedEntireFileEvents[rankSameUnit,"event_time"]
  uFileEventIndex <-  rankedEntireFileEvents[rankSameUnit,"fileEventIndex"]
  nDates = length(uDates)
  if (nDates>0) {
    dfBeforeFirstDate = df2$event_time < uDates[1]
    df2_rows = dfSameUnit & dfBeforeFirstDate
    if (any(df2_rows)) {
      df2[df2_rows, "fileEventIndex"] = 0
    }
    for (id in seq(1,nDates-1)) {
      dfAfterCurrentDate = df2$event_time >= uDates[id]
      dfBeforeNextDate = df2$event_time < uDates[id]
      currentRank = uFileEventIndex[id]
      df2_rows = dfSameUnit & dfAfterCurrentDate & dfBeforeNextDate
      if (any(df2_rows)) {
        df2[df2_rows, "fileEventIndex" ] = currentRank
      }
    }
    dfAfterLastDate = df2$event_time >= uDates[nDates]
    df2_rows = dfSameUnit & dfAfterLastDate
    if (any(df2_rows)) {
      df2[df2_rows, "fileEventIndex"] = uFileEventIndex[nDates]  
    }
  }
}

这是df2的输出,不应该有NA值

> with(df2,head(df2[order(unit,event_time),],50))
      event unit          event_time fileEventIndex
7  Event  g    1 2014-01-18 05:39:10             NA
25 Event  g    1 2014-03-25 01:56:28             NA
38 Event  g    1 2014-04-29 09:57:39             NA
42 Event  j    1 2014-05-17 05:39:30             NA
43 Event  g    1 2014-05-23 05:07:06             NA
46 Event  g    1 2014-06-03 07:12:13             NA
53 Event  i    1 2014-06-25 21:51:25             NA
54 Event  h    1 2014-06-30 00:41:00             NA
64 Event  f    1 2014-08-05 06:28:56             NA
2  Event  f    2 2014-01-03 03:27:28              0
12 Event  h    2 2014-02-01 08:52:08             NA
27 Event  i    2 2014-03-25 22:36:06             NA
39 Event  f    2 2014-05-02 07:00:18             NA
44 Event  f    2 2014-05-24 09:41:48             NA
47 Event  j    2 2014-06-04 22:45:07             NA
50 Event  g    2 2014-06-08 20:25:46             NA
58 Event  j    2 2014-07-19 05:03:48             NA
67 Event  h    2 2014-08-10 05:00:55             NA
22 Event  h    3 2014-03-15 20:25:16              0

1 个答案:

答案 0 :(得分:4)

执行此操作的一个不错的方法是在您的primaryEntireFileEvents条目上使用data.table滚动连接到主表。

library(data.table)

dt1<-data.table(df1)
dt2<-data.table(df2)
rankedEntireFileEvents.table <-data.table(rankedEntireFileEvents)

setkey(dt1,unit,event_time)
setkey(dt2,unit,event_time)
setkey(rankedEntireFileEvents.table, unit, event_time)

dt1.ranked <- rankedEntireFileEvents.table[dt1, roll=TRUE]
#some cleaning up to get your desired result
dt1.ranked$Event.Name <-NULL
setnames(dt1.ranked,"i.Event.Name","Event.Name")

#NA's in fileEventIndex indicte they precede fileEventIndex 1 

dt2.ranked <- rankedEntireFileEvents.table[dt2, roll=TRUE]

现在表dt1.ranked和dt2.ranked是你想要的输出。