我正在处理共享单车站点数据,并写了一个 for 循环,用来找出下一次行程的起始站点与上一次行程的结束站点不同的自行车,然后重新排列 stoptime 和 starttime,以显示运营方对自行车的调度(从它上次停放的站点,到它下次出发的站点),并计算 diff.time,即上一次行程结束与下一次行程开始之间的时间差。
示例数据
starttime stoptime start.station.id end.station.id bikeid
1 2017-01-16 13:08:18 2017-01-16 13:28:13 3156 466 1
2 2017-01-10 19:10:31 2017-01-10 19:16:02 422 3090 1
3 2017-01-04 08:47:42 2017-01-04 08:57:10 507 442 1
4 2017-01-12 18:08:33 2017-01-12 18:36:09 546 3151 2
5 2017-01-21 09:52:13 2017-01-21 10:21:07 3243 212 2
6 2017-01-26 05:46:18 2017-01-26 05:49:13 470 168 2
我的代码
# Build `output1`: one row per relocation of a bike, i.e. every pair of
# consecutive trips of the same bike where the next trip starts at a different
# station than the one where the previous trip ended.
#
# Input:  `test`, a data frame with columns starttime, stoptime,
#         start.station.id, end.station.id and bikeid.
# Output: `output1` with columns bikeid, end.station.id, start.station.id,
#         diff.time (seconds between the previous stoptime and the next
#         starttime), stoptime and starttime (both formatted as text).
raw_data <- test
unique_id <- unique(raw_data$bikeid)
output1 <- data.frame(
  "bikeid" = integer(0),
  "end.station.id" = integer(0),
  "start.station.id" = integer(0),
  "diff.time" = numeric(0),
  "stoptime" = character(),
  "starttime" = character(),
  stringsAsFactors = FALSE
)

time_format <- "%Y-%m-%d %H:%M:%S"

# Convert a timestamp column to POSIXct once, up front. Values that are already
# date-times are kept as they are; text is parsed with the same format and time
# zone the original loop used.
to_posixct <- function(x) {
  if (inherits(x, "POSIXt")) {
    as.POSIXct(x)
  } else {
    as.POSIXct(strptime(as.character(x), time_format, tz = "EST"))
  }
}

# Work on a copy so that `raw_data` keeps its original column types.
trips <- raw_data
trips$starttime <- to_posixct(trips$starttime)
trips$stoptime <- to_posixct(trips$stoptime)

# One result slot per bike; everything is bound together once after the loop
# instead of growing `output1` row by row.
moves <- vector("list", length(unique_id))
for (k in seq_along(unique_id)) {
  bikeid <- unique_id[k]
  onebike <- trips[which(trips$bikeid == bikeid), ]
  onebike <- onebike[order(onebike$starttime, decreasing = FALSE), ]
  n_trips <- nrow(onebike)
  if (n_trips < 2) {
    next
  }

  # Align each trip's end station with the start station of the next trip.
  end_id <- onebike$end.station.id[-n_trips]
  next_start_id <- onebike$start.station.id[-1]
  # Same guard as before: only integer station ids are compared.
  if (!is.integer(end_id) || !is.integer(next_start_id)) {
    next
  }

  # which() drops NA comparisons, so a missing station id no longer aborts
  # the run with "missing value where TRUE/FALSE needed".
  moved <- which(end_id != next_start_id)
  if (length(moved) == 0) {
    next
  }

  # A data frame (rather than c()) keeps every column in its own type.
  moves[[k]] <- data.frame(
    bikeid = rep(bikeid, length(moved)),
    end.station.id = end_id[moved],
    start.station.id = next_start_id[moved],
    diff.time = as.double(difftime(
      onebike$starttime[moved + 1],
      onebike$stoptime[moved],
      units = "secs"
    )),
    stoptime = format(onebike$stoptime[moved], time_format),
    starttime = format(onebike$starttime[moved + 1], time_format),
    stringsAsFactors = FALSE
  )
}

# rbind() ignores NULL entries and zero-row data frames; if no bike was moved,
# the empty `output1` template is returned unchanged.
output1 <- do.call(rbind, c(list(output1), moves))
row.names(output1) <- NULL
输出
bikeid end.station.id start.station.id diff.time stoptime starttime
1 1 442 422 555201 2017-01-04 08:57:10 2017-01-10 19:10:31
2 1 3090 3156 496336 2017-01-10 19:16:02 2017-01-16 13:08:18
3 2 3151 3243 746164 2017-01-12 18:36:09 2017-01-21 09:52:13
4 2 212 470 415511 2017-01-21 10:21:07 2017-01-26 05:46:18
5 3 3112 351 1587161 2017-01-12 08:58:42 2017-01-30 17:51:23
然而,在大型数据集上,这个 for 循环需要很长时间。有没有办法用 dplyr 或 data.table 来加速这个循环,或者以避免循环的方式重新整理数据?任何解释或建议都非常感谢。
示例数据(在输入中)
# Sample input as produced by dput(): a data.frame with 9 trips (rows) for
# bike ids 1, 2 and 3 (three trips each).
# - starttime / stoptime: POSIXct date-times with tzone "UTC"
#   (stored as seconds since the epoch).
# - start.station.id / end.station.id: integer station ids.
# - bikeid: stored as double, not integer.
structure(list(starttime = structure(c(1484572098, 1484075431,
1483519662, 1484244513, 1484992333, 1485409578, 1484210616, 1483727948,
1485798683), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
stoptime = structure(c(1484573293, 1484075762, 1483520230,
1484246169, 1484994067, 1485409753, 1484211522, 1483729024,
1485799997), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
start.station.id = c(3156L, 422L, 507L, 546L, 3243L, 470L,
439L, 309L, 351L), end.station.id = c(466L, 3090L, 442L,
3151L, 212L, 168L, 3112L, 439L, 433L), bikeid = c(1, 1, 1,
2, 2, 2, 3, 3, 3)), .Names = c("starttime", "stoptime", "start.station.id",
"end.station.id", "bikeid"), row.names = c(NA, -9L), class = "data.frame")
答案 0 :(得分:3)
一种方法如下。我把你的数据命名为 foo。首先,你可能需要按 bikeid 和 starttime 对数据进行排序。然后,对每个 bikeid,使用 lead() 创建新列(即 next.start.station.id 和 next.start.time)。接着使用 difftime() 计算时间差。之后,删除 end.station.id 与 next.start.station.id 相同的行。最后,你可以根据需要排列各列。
library(dplyr)
# For every bike, pair each trip with that bike's next trip and keep only the
# pairs where the next trip starts at a different station than the one where
# the current trip ended.
foo %>%
  arrange(bikeid, starttime) %>% # if necessary, arrange(bikeid, starttime, stoptime)
  group_by(bikeid) %>%
  mutate(
    # lead() looks one row ahead within the current bike only.
    next.start.station.id = lead(start.station.id),
    next.start.time = lead(starttime),
    diff.time = difftime(next.start.time, stoptime, units = "secs")
  ) %>%
  # The last trip of each bike has no successor (lead() gives NA); filter()
  # drops rows whose condition is NA, so those rows are removed here as well.
  filter(end.station.id != next.start.station.id) %>%
  # Drop the grouping so later verbs on the result are not applied per bike.
  ungroup() %>%
  select(
    bikeid, end.station.id, next.start.station.id,
    diff.time, stoptime, next.start.time
  )
bikeid end.station.id next.start.station.id diff.time stoptime next.start.time
<dbl> <int> <int> <time> <dttm> <dttm>
1 1.00 442 422 555201 2017-01-04 08:57:10 2017-01-10 19:10:31
2 1.00 3090 3156 496336 2017-01-10 19:16:02 2017-01-16 13:08:18
3 2.00 3151 3243 746164 2017-01-12 18:36:09 2017-01-21 09:52:13
4 2.00 212 470 415511 2017-01-21 10:21:07 2017-01-26 05:46:18
5 3.00 3112 351 1587161 2017-01-12 08:58:42 2017-01-30 17:51:23