Question

我将以下数据作为跨越一个月的POSIXct时间列表。他们每个人代表自行车交付。我的目标是在24小时内找到每十分钟间隔的平均自行车交付量（总共产生144行）。首先，所有行程都需要求和并分成一个区间，然后除以天数。到目前为止，我已经设法编写了一个代码，该代码总计每10分钟间隔的行程，但它会产生不正确的值。我不确定它出了什么问题。

数据如下所示：

head(start_times)
[1] "2014-10-21 16:58:13 EST" "2014-10-07 10:14:22 EST" "2014-10-20 01:45:11 EST"
[4] "2014-10-17 08:16:17 EST" "2014-10-07 17:46:36 EST" "2014-10-28 17:32:34 EST"
length(start_times)
[1] 1747

代码如下所示：

library(lubridate)
library(dplyr)

tripduration <- floor(runif(1747) * 1000)

time_bucket <- start_times - minutes(minute(start_times) %% 10) - seconds(second(start_times))

df <- data.frame(tripduration, start_times, time_bucket)
summarized <- df %>%
group_by(time_bucket) %>%
summarize(trip_count = n())
summarized <- as.data.frame(summarized)
out_buckets <- data.frame(out_buckets = seq(as.POSIXlt("2014-10-01 00:00:00"), as.POSIXct("2014-10-31 23:0:00"), by = 600))
out <- left_join(out_buckets, summarized, by = c("out_buckets" = "time_bucket"))
out$trip_count[is.na(out$trip_count)] <- 0



head(out)
          out_buckets trip_count
1 2014-10-01 00:00:00          0
2 2014-10-01 00:10:00          0
3 2014-10-01 00:20:00          0
4 2014-10-01 00:30:00          0
5 2014-10-01 00:40:00          0
6 2014-10-01 00:50:00          0
dim(out)
[1] 4459    2

test <- format(out$out_buckets,"%H:%M:%S")
test2 <- out$trip_count
test <- cbind(test, test2)
colnames(test)[1] <- "interval"
colnames(test)[2] <- "count"
test <- as.data.frame(test)
test$count <- as.numeric(test$count) 
test <- aggregate(count~interval, test, sum)


head(test, n = 20)
   interval count
1  00:00:00    32
2  00:10:00    33
3  00:20:00    32
4  00:30:00    31
5  00:40:00    34
6  00:50:00    34
7  01:00:00    31
8  01:10:00    33
9  01:20:00    39
10 01:30:00    41
11 01:40:00    36
12 01:50:00    31
13 02:00:00    33
14 02:10:00    34
15 02:20:00    32
16 02:30:00    32
17 02:40:00    36
18 02:50:00    32
19 03:00:00    34
20 03:10:00    39



but this is impossible because when I sum the counts


sum(test$count)
[1] 7494

我得到7494而数字应该是1747

我不确定我哪里出错了，以及如何简化此代码以获得相同的结果。

Answer 1

我已尽我所能，但如果没有您的数据，我无法重现您的问题。

library(dplyr)

我创建了10分钟块的完整序列：

blocks.of.10mins <- data.frame(out_buckets=seq(as.POSIXct("2014/10/01 00:00"), by="10 mins", length.out=30*24*6))

然后将start_times拆分为相同的箱子。注意：我创建了一个午夜基线时间，强制块对齐10分钟。稍后删除它是读者的练习。我还更改了您的一个数据点，以便在同一个bin中至少有一个多个记录的示例。

start_times <- as.POSIXct(c("2014-10-01 00:00:00", ## added
                            "2014-10-21 16:58:13",
                            "2014-10-07 10:14:22",
                            "2014-10-20 01:45:11",
                            "2014-10-17 08:16:17",
                            "2014-10-07 10:16:36", ## modified
                            "2014-10-28 17:32:34"))

trip_times <- data.frame(start_times) %>% 
    mutate(out_buckets = as.POSIXct(cut(start_times, breaks="10 mins")))

然后可以合并start_times和所有10分钟的间隔

trips_merged <- merge(trip_times, blocks.of.10mins, by="out_buckets", all=TRUE)

然后可以将这些分组10分钟并计算

trips_merged %>% filter(!is.na(start_times)) %>% 
  group_by(out_buckets) %>% 
  summarise(trip_count=n())

Source: local data frame [6 x 2]

          out_buckets trip_count
               (time)      (int)
1 2014-10-01 00:00:00          1
2 2014-10-07 10:10:00          2
3 2014-10-17 08:10:00          1
4 2014-10-20 01:40:00          1
5 2014-10-21 16:50:00          1
6 2014-10-28 17:30:00          1

相反，如果我们只考虑时间，而不是日期

trips_merged2 <- trips_merged
trips_merged2$out_buckets <- format(trips_merged2$out_buckets, "%H:%M:%S")

trips_merged2 %>% filter(!is.na(start_times)) %>% 
  group_by(out_buckets) %>% 
  summarise(trip_count=n())

Source: local data frame [6 x 2]

  out_buckets trip_count
        (chr)      (int)
1    00:00:00          1
2    01:40:00          1
3    08:10:00          1
4    10:10:00          2
5    16:50:00          1
6    17:30:00          1

按时间间隔总结不起作用

1 个答案: