R:在 data.table 中使用其他列的滞后值并结合多个其他条件来创建新列

时间:2016-07-05 10:05:07

标签: r data.table data-manipulation

以下是样本数据,其中最后一列就是希望得到的结果列

# Eight sample requests, all for driver "69". The final column,
# last_ride_diff, holds the values the computation is expected to reproduce.
data <- structure(
  list(
    engagement_date = structure(rep(16939, 8), class = "Date"),
    driver_id = structure(rep(1L, 8), .Label = "69", class = "factor"),
    session_id = structure(
      1:8,
      .Label = c(
        "16525506", "16526272", "16527063", "16531156", "16532064",
        "16533490", "16541432", "16547653", "16548040", "16553477",
        "16558000"
      ),
      class = "factor"
    ),
    status = structure(
      c(3L, 2L, 3L, 4L, 1L, 3L, 1L, 2L),
      .Label = c("3", "4", "6", "7"),
      class = "factor"
    ),
    req_made_time = structure(
      c(
        1463556140, 1463556681, 1463557268, 1463560083,
        1463560796, 1463562026, 1463568316, 1463572256
      ),
      class = c("POSIXct", "POSIXt"),
      tzone = ""
    ),
    ride_drop_time = structure(
      c(NA, NA, NA, NA, 1463561749, NA, 1463569532, NA),
      class = c("POSIXct", "POSIXt"),
      tzone = ""
    ),
    cmplt_flag = structure(
      c(1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L),
      .Label = c("0", "1"),
      class = "factor"
    ),
    req_no = 1:8,
    last_req_diff = c(0, 9, 9.8, 46.9, 11.9, 20.5, 104.8, 65.7),
    last_ride_diff = c(720, 729, 738.8, 785.7, 797.6, 4.6, 109.4, 45.4)
  ),
  row.names = c(NA, 8L),
  class = "data.frame"
)

样本数据集中的最后一列是所需的列,已通过子集上的excel公式实现。我也可以使用下面的代码获取最后一列,但由于数据量很大,所以需要花费无尽的时间。下面的代码将帮助您了解各种条件

# Row-by-row reference computation of last_ride_diff.
#
# Rules, applied to each row i in the existing row order:
#   * req_no is 1 (or less): start from the fixed value 720.
#   * the previous row has cmplt_flag == 1: minutes between this row's
#     req_made_time and the previous row's ride_drop_time.
#   * otherwise: the previous row's last_ride_diff plus this row's
#     last_req_diff.
#
# Every row reads the row directly above it, so the rows must already be in
# request order within each driver.
data1 <- as.data.frame(data)

n_rows <- nrow(data1)
# Fill a plain vector and assign it once; writing into the data frame on
# every iteration copies the whole frame each time.
ride_gap <- rep(NA_real_, n_rows)

for (i in seq_len(n_rows)) {
  if (data1$req_no[i] > 1) {
    if (data1$cmplt_flag[i - 1] == 1) {
      # `units` must be named: the third positional argument of difftime()
      # is `tz`, not `units`.
      ride_gap[i] <- as.numeric(
        difftime(
          data1$req_made_time[i],
          data1$ride_drop_time[i - 1],
          units = "mins"
        )
      )
    } else {
      ride_gap[i] <- ride_gap[i - 1] + data1$last_req_diff[i]
    }
  } else {
    ride_gap[i] <- 720
  }
}

data1$last_ride_diff <- ride_gap

请建议一种更快的方法来获取所需的值,可以使用 data.table 或任何其他替代方案。由于我的数据集中有很多 driver_id,我需要针对每个 driver_id 分别获得所需的结果。

1 个答案:

答案 0 :(得分:0)

这是一种可行的方法。需要用到 zoo 包中很有用的 na.locf 函数(它把最近的非 NA 值沿指定方向延伸,用来填充 NA)。

library(data.table)
library(zoo)

# Work on a data.table copy of the sample and drop the hand-computed target
# column by name, so the result is derived only from the raw columns.
dataT <- as.data.table(data)[
  , setdiff(names(data), "last_ride_diff"), with = FALSE
]

# Within each driver, carry the most recent drop time forward over the rows
# that follow it; rows before the first drop stay NA. Like the lag steps
# below, this assumes the rows are already in request order per driver.
dataT[
  , drop_time_fill := na.locf(ride_drop_time, na.rm = FALSE),
  by = driver_id
]

dataout <- dataT[
  ,
  {
    # Minutes since the driver's previous request. Units are fixed to "mins"
    # because plain POSIXct subtraction picks its units automatically, which
    # can differ from one driver to the next.
    prev_req_gap <- as.numeric(
      difftime(req_made_time, shift(req_made_time), units = "mins")
    )
    # The first request of a driver has no predecessor; use 0.
    prev_req_gap[is.na(prev_req_gap)] <- 0

    .(
      ride_drop_time,
      drop_time_fill,
      last_req_dif = prev_req_gap,
      # Minutes since the last drop time known as of the previous row.
      last_ride_diff = difftime(
        req_made_time, shift(drop_time_fill, 1), units = "mins"
      )
    )
  },
  by = driver_id
]

# Rows still NA are those before the driver's first drop. For them, start
# from 720 and accumulate the request gaps. `i` has already restricted each
# group to its NA rows, so last_req_dif needs no further indexing here.
# These sums use the unrounded gaps, so they can differ from the question's
# rounded sample column by a fraction of a minute.
dataout[
  is.na(last_ride_diff),
  last_ride_diff := 720 + cumsum(last_req_dif),
  by = driver_id
]

dataout

   driver_id      ride_drop_time      drop_time_fill last_req_dif  last_ride_diff
1:        69                <NA>                <NA>     0.000000 720.000000 mins
2:        69                <NA>                <NA>     9.016667 729.000000 mins
3:        69                <NA>                <NA>     9.783333 738.800000 mins
4:        69                <NA>                <NA>    46.916667 785.700000 mins
5:        69 2016-05-18 10:55:49 2016-05-18 10:55:49    11.883333 797.600000 mins
6:        69                <NA> 2016-05-18 10:55:49    20.500000   4.616667 mins
7:        69 2016-05-18 13:05:32 2016-05-18 13:05:32   104.833333 109.450000 mins
8:        69                <NA> 2016-05-18 13:05:32    65.666667  45.400000 mins

我花了一些时间才弄清楚,不过这是一个有趣的问题。注意:这里假设只有一条 last_req_dif = 0 的记录(我猜就是开头的那一条)。

我无法测试整个数据,所以由您决定。