以下是样本数据,其中最后一列(last_ride_diff)是所需的结果列:
# Sample data frame (dput output): 8 request rows, all for driver_id "69".
# Columns: engagement_date, driver_id, session_id, status, req_made_time,
# ride_drop_time, cmplt_flag, req_no, last_req_diff, last_ride_diff.
# ride_drop_time is NA except on rows 5 and 7, which are also the only rows
# whose cmplt_flag is "1".
# The final column, last_ride_diff, holds the expected result values.
data<-structure(list(engagement_date = structure(c(16939, 16939, 16939,
16939, 16939, 16939, 16939, 16939), class = "Date"), driver_id = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "69", class = "factor"),
session_id = structure(1:8, .Label = c("16525506", "16526272",
"16527063", "16531156", "16532064", "16533490", "16541432",
"16547653", "16548040", "16553477", "16558000"), class = "factor"),
status = structure(c(3L, 2L, 3L, 4L, 1L, 3L, 1L, 2L), .Label = c("3",
"4", "6", "7"), class = "factor"), req_made_time = structure(c(1463556140,
1463556681, 1463557268, 1463560083, 1463560796, 1463562026,
1463568316, 1463572256), class = c("POSIXct", "POSIXt"), tzone = ""),
ride_drop_time = structure(c(NA, NA, NA, NA, 1463561749,
NA, 1463569532, NA), class = c("POSIXct", "POSIXt"), tzone = ""),
cmplt_flag = structure(c(1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L), .Label = c("0",
"1"), class = "factor"), req_no = 1:8, last_req_diff = c(0,
9, 9.8, 46.9, 11.9, 20.5, 104.8, 65.7), last_ride_diff = c(720,
729, 738.8, 785.7, 797.6, 4.6, 109.4, 45.4)), .Names = c("engagement_date",
"driver_id", "session_id", "status", "req_made_time", "ride_drop_time",
"cmplt_flag", "req_no", "last_req_diff", "last_ride_diff"), row.names = c(NA,
8L), class = "data.frame")
样本数据集中的最后一列是所需的列,它是在一个子集上用Excel公式计算得到的。我也可以用下面的代码(其中data1是完整的数据集)得到最后一列,但由于数据量很大,运行时间极长。下面的代码可以帮助您了解各种条件:
# Row-by-row reference computation of `last_ride_diff` (in minutes).
#
# Rules, applied in row order:
#   * first request of a driver (req_no <= 1)     -> 720
#   * previous request was completed (flag == 1)  -> minutes between this
#     request and the previous row's ride_drop_time
#   * otherwise                                   -> previous last_ride_diff
#     plus this row's last_req_diff
#
# Each row depends on the value just written for the previous row, so the
# loop must run in order.
# NOTE(review): assumes `req_no` and `cmplt_flag` contain no NA values.
data1 <- as.data.frame(data1)

for (i in seq_len(nrow(data1))) {
  prev <- i - 1L
  data1$last_ride_diff[i] <- if (prev < 1L || data1$req_no[i] <= 1) {
    # No previous row to look at, or the driver's first request.
    720
  } else if (data1$cmplt_flag[prev] == 1) {
    # `units` must be named: the third positional argument of difftime() is
    # `tz`, so a bare "mins" would be taken as a time zone.
    as.numeric(difftime(data1$req_made_time[i],
                        data1$ride_drop_time[prev],
                        units = "mins"))
  } else {
    data1$last_ride_diff[prev] + data1$last_req_diff[i]
  }
}
请建议一种更快的方法来获取所需的值,例如使用data.table或任何其他替代方法。由于我的数据集中有很多driver_id,我需要为每个driver_id分别获得所需的结果。

答案 0 :(得分:0)
这是一种可行的方法。
需要用到zoo包中的na.locf函数(用最近的非NA值向任一方向延伸,以填充NA)。
library(data.table)
library(zoo)
# Vectorised, per-driver computation of `last_ride_diff` (in minutes).

# Work on a data.table copy and drop the last column (the expected
# `last_ride_diff`) so that it is recomputed from scratch.
dataT <- as.data.table(data)[, -length(data), with = FALSE]

# Within each driver, carry the most recent non-NA drop time forward.
# Rows before the driver's first drop stay NA (na.rm = FALSE keeps them).
dataT[, drop_time_fill := na.locf(ride_drop_time, na.rm = FALSE),
      by = driver_id]

dataout <- dataT[,
  .(ride_drop_time,
    drop_time_fill,
    # Minutes since the driver's previous request; 0 where there is none.
    last_req_dif = {
      gap <- as.numeric(difftime(req_made_time, shift(req_made_time),
                                 units = "mins"))
      gap[is.na(gap)] <- 0
      gap
    },
    # Minutes since the last drop seen on any earlier row of this driver.
    # Units are fixed explicitly: plain POSIXct subtraction picks them
    # automatically, so they could vary with the data.
    last_ride_diff = difftime(req_made_time, shift(drop_time_fill, 1),
                              units = "mins")
  ),
  by = driver_id]

# Rows with no earlier drop: start at 720 and accumulate the request gaps.
# The filter and the cumulative sum both use columns of `dataout`, so each
# driver is handled independently and no global variable is needed.
# The gaps come from the timestamps, so results can differ slightly from
# values derived from the rounded `last_req_diff` column of the sample.
dataout[is.na(last_ride_diff),
        last_ride_diff := 720 + cumsum(last_req_dif),
        by = driver_id]
dataout
driver_id ride_drop_time drop_time_fill last_req_dif last_ride_diff
1: 69 <NA> <NA> 0.000000 720.000000 mins
2: 69 <NA> <NA> 9.016667 729.000000 mins
3: 69 <NA> <NA> 9.783333 738.800000 mins
4: 69 <NA> <NA> 46.916667 785.700000 mins
5: 69 2016-05-18 10:55:49 2016-05-18 10:55:49 11.883333 797.600000 mins
6: 69 <NA> 2016-05-18 10:55:49 20.500000 4.616667 mins
7: 69 2016-05-18 13:05:32 2016-05-18 13:05:32 104.833333 109.450000 mins
8: 69 <NA> 2016-05-18 13:05:32 65.666667 45.400000 mins
我花了一些时间才弄清楚,不过这是一个有趣的问题。注意:这里假设只有一条记录的last_req_dif = 0(我猜是开头的那一条)。
我无法在完整数据上进行测试,所以需要由您自行验证。