我有一个数据表只包含一系列次数。我有另一个包含两列的数据表:start_time和end_time。我想获取第一个数据表并添加一列,其中值是第二个数据表中所有行的计数,其中第一个数据表的时间适合开始和结束时间。这是我的代码
start_date <- as.POSIXct(x = "2017-01-31 17:00:00", format = "%Y-%m-%d %H:%M:%S")
end_date <- as.POSIXct(x = "2017-02-01 09:00:00", format = "%Y-%m-%d %H:%M:%S")
all_dates <- as.data.table(seq(start_date, end_date, "min"))
colnames(all_dates) <- c("Bin")
start_times <- sample(seq(start_date,end_date,"min"), 100)
offsets <- sample(seq(60,7200,60), 100)
end_times <- start_times + offsets
input_data <- data.table(start_times, end_times)
这是我想要做的,但这是错误的,并给出了错误。写这个的正确方法是什么?
all_dates[, BinCount := input_data[start_times < Bin & end_times > Bin, .N] ]
最后我应该得到像
这样的东西Bin BinCount
2017-01-31 17:00:00 1
2017-01-31 17:01:00 5
...
答案 0 :(得分:3)
使用sqldf
可以非常轻松地解决问题,因为它提供了使用范围检查连接表的简便方法。因此,一种解决方案可能是:
The data from OP:
library(data.table)
start_date <- as.POSIXct(x = "2017-01-31 17:00:00", format = "%Y-%m-%d %H:%M:%S")
end_date <- as.POSIXct(x = "2017-02-01 09:00:00", format = "%Y-%m-%d %H:%M:%S")
all_dates <- as.data.table(seq(start_date, end_date, "min"))
colnames(all_dates) <- c("Bin")
start_times <- sample(seq(start_date,end_date,"min"), 100)
offsets <- sample(seq(60,7200,60), 100)
end_times <- start_times + offsets
input_data <- data.table(start_times, end_times)
library(sqldf)
result <- sqldf("SELECT all_dates.bin, count() as BinCount
FROM all_dates, input_data
WHERE all_dates.bin > input_data.start_times AND
all_dates.bin < input_data.end_times
GROUP BY bin" )
result
Bin BinCount
1 2017-01-31 17:01:00 1
2 2017-01-31 17:02:00 1
3 2017-01-31 17:03:00 1
4 2017-01-31 17:04:00 1
5 2017-01-31 17:05:00 1
6 2017-01-31 17:06:00 1
...........
...........
497 2017-02-01 01:17:00 6
498 2017-02-01 01:18:00 5
499 2017-02-01 01:19:00 5
500 2017-02-01 01:20:00 4
[ reached getOption("max.print") -- omitted 460 rows ]
答案 1 :(得分:2)
在data.table
中,您正在进行范围加入。
library(data.table)
start_date <- as.POSIXct(x = "2017-01-31 17:00:00", format = "%Y-%m-%d %H:%M:%S")
end_date <- as.POSIXct(x = "2017-02-01 09:00:00", format = "%Y-%m-%d %H:%M:%S")
all_dates <- as.data.table(seq(start_date, end_date, "min"))
colnames(all_dates) <- c("Bin")
set.seed(123)
start_times <- sample(seq(start_date,end_date,"min"), 100)
offsets <- sample(seq(60,7200,60), 100)
end_times <- start_times + offsets
input_data <- data.table(start_times, end_times)
## doing the range-join and calculating the number of items per bin in one chained step
input_data[
all_dates
, on = .(start_times < Bin, end_times > Bin)
, nomatch = 0
, allow.cartesian = T
][, .N, by = start_times]
# start_times N
# 1: 2017-01-31 17:01:00 1
# 2: 2017-01-31 17:02:00 1
# 3: 2017-01-31 17:03:00 1
# 4: 2017-01-31 17:04:00 1
# 5: 2017-01-31 17:05:00 1
# ---
# 956: 2017-02-01 08:56:00 6
# 957: 2017-02-01 08:57:00 4
# 958: 2017-02-01 08:58:00 4
# 959: 2017-02-01 08:59:00 5
# 960: 2017-02-01 09:00:00 5
注意:
all_dates
对象放在联接的右侧,因此结果包含input_data
列的名称,即使它们是您的Bins(请参阅{{ 3}}关于这个主题的讨论)set.seed()
,因为你正在采样答案 2 :(得分:1)
未请求,但这是使用tidyverse
的紧凑替代解决方案。使用lubridate
解析器interval
和%within%
以及purrr::map_int
生成所需的bin计数。
library(tidyverse)
library(lubridate)
start_date <- ymd_hms(x = "2017-01-31 17:00:00") # lubridate parsers
end_date <- ymd_hms(x = "2017-02-01 09:00:00")
all_dates <- tibble(seq(start_date, end_date, "min")) # tibble swap for data.table
colnames(all_dates) <- c("Bin")
start_times <- sample(seq(start_date,end_date,"min"), 100)
offsets <- sample(seq(60,7200,60), 100)
end_times <- start_times + offsets
input_data <- tibble(
start_times,
end_times,
intvl = interval(start_times, end_times) # Add interval column
)
all_dates %>% # Checks date in Bin and counts intervals it lies within
mutate(BinCount = map_int(.$Bin, ~ sum(. %within% input_data$intvl)))
# A tibble: 961 x 2
Bin BinCount
<dttm> <int>
1 2017-01-31 17:00:00 0
2 2017-01-31 17:01:00 0
3 2017-01-31 17:02:00 0
4 2017-01-31 17:03:00 0
5 2017-01-31 17:04:00 0
6 2017-01-31 17:05:00 0
7 2017-01-31 17:06:00 0
8 2017-01-31 17:07:00 1
9 2017-01-31 17:08:00 1
10 2017-01-31 17:09:00 1
# ... with 951 more rows