我正在研究一个由旅行组成的数据集(变量:user_id, purpose, start_timeh, end_timeh
。每次观察应该是一次旅行;但是由于数据错误,大约一半的旅行被分成了旅行段。在单次旅行的多次观察中(例如,旅行#3被分成5段/具有不同旅行ID的观察)。
> head(df2)
trip_id user_id duration_min purpose start_timeh end_timeh fix_needed
1 151203 abc 10.0 7 19:07 19:16 1
2 151204 abc 1.5 7 19:16 19:18 1
3 151206 abc 1.0 7 20:59 21:03 1
4 151207 abc 3.0 7 21:03 21:05 1
5 151208 abc 5.5 7 21:05 21:10 1
6 151210 abc 4.5 2 21:18 21:25 0
>
> dput(head(df2,4))
structure(list(trip_id = c(151203L, 151204L, 151206L, 151207L
), user_id = structure(c(1L, 1L, 1L, 1L), .Label = "abc", class = "factor"),
duration_min = c(10, 1.5, 1, 3), purpose = c(7L, 7L, 7L,
7L), start_timeh = structure(c(27L, 29L, 37L, 39L), .Label = c("0:08",
"15:50", "15:53", "15:55", "16:01", "16:10", "16:35", "17:04",
"17:08", "17:14", "17:25", "17:28", "17:32", "17:34", "17:48",
"17:54", "18:14", "18:17", "18:19", "18:28", "18:41", "18:44",
"18:47", "18:50", "18:54", "18:56", "19:07", "19:08", "19:16",
"19:18", "19:19", "19:23", "19:30", "19:59", "2:12", "2:25",
"20:59", "21:00", "21:03", "21:05", "21:18"), class = "factor"),
end_timeh = structure(c(28L, 29L, 36L, 37L), .Label = c("0:49",
"15:50", "15:53", "15:55", "16:01", "16:04", "16:12", "16:57",
"17:06", "17:08", "17:25", "17:32", "17:34", "17:52", "17:55",
"17:56", "18:14", "18:16", "18:19", "18:30", "18:44", "18:47",
"18:50", "18:54", "18:56", "18:58", "19:10", "19:16", "19:18",
"19:19", "19:27", "19:28", "19:32", "2:17", "20:06", "21:03",
"21:05", "21:10", "21:25", "21:39", "3:05"), class = "factor"),
fix_needed = c(1L, 1L, 1L, 1L)), .Names = c("trip_id", "user_id",
"duration_min", "purpose", "start_timeh", "end_timeh", "fix_needed"
), row.names = c(NA, 4L), class = "data.frame")
我想根据三个标准组合这些旅行段:
最终结果应如下所示:
所以我跑了这个:
v = vector('numeric')
for (i in 2:nrow(df)) {
if (as.numeric(df$start_timeh[i]) == as.numeric(df$end_timeh[i-1]) && df$user_id[i] == df$user_id[i-1] && df$purpose[i] == df$purpose[i-1])
{
df$duration_min[i]<-df$duration_min[i]+df$duration_min[i-1]
v <- c(v,i-1)
}
}
df <- df[-v,]
然而,结果并不像我预期的那样。很多细分都没有删除。
更新:创建此问题的时间戳出错。代码是正确的。