df1(见下文)是一个事件日志。变量 1 由(非唯一的)时间戳(POSIXct)组成。变量 2:4 由事件的属性(因子)组成。
我创建了df2和df3来定义时间段。 df2存储初始时间,df3存储每个时间段的结束时间。
问题是如何使用 df2 的变量名(与 df3 的变量名相同)扩展 df1,同时为每个事件填充 TRUE 或 FALSE,具体取决于该事件是否落在该变量的某一个时间区间内。换句话说,如果事件属于某个时间区间(由 df2 和 df3 定义),则值为 TRUE,否则为 FALSE。df1 中的每个事件都需要针对所有时间区间(df2 和 df3 中所有对应的元素对)进行检查,每次检查一个变量(即 df2 和 df3 中的同名列)。
由于存在大量变量和事件,我无法以交互方式执行此操作。 但是想学习如何用R方式做,避免显式循环,并利用向量化。
DATA(小型采样数据集)
# Event log: one row per event. `time.stamp` is parsed to POSIXct (duplicates
# are allowed) and `g.id` is the event's group stored as a factor.
df1 <- data.frame(
  time.stamp = as.POSIXct(
    c(
      "2015-01-05 15:00:00", "2015-01-05 15:01:00",
      "2015-01-05 15:02:00", "2015-01-05 15:02:00",
      "2015-01-05 15:03:00", "2015-01-05 15:03:00",
      "2015-01-05 15:03:00", "2015-01-05 15:03:00"
    ),
    format = "%Y-%m-%d %H:%M:%S"
  ),
  g.id = factor(c("848", "737", "848", "848", "737", "848", "737", "737"))
)
# Bin start times: each column (m0p1 .. m3p1) holds the start timestamps of
# that variable's time bins. Values are entered as text and then every column
# is parsed to POSIXct in a single pass.
df2 <- data.frame(
  m0p1 = c("2015-01-05 15:00:00", "2015-01-05 16:00:00", "2015-01-05 17:00:00"),
  m1p1 = c("2015-01-05 15:01:00", "2015-01-05 16:01:00", "2015-01-05 17:01:00"),
  m2p1 = c("2015-01-05 15:02:00", "2015-01-05 16:02:00", "2015-01-05 17:02:00"),
  m3p1 = c("2015-01-05 15:03:00", "2015-01-05 16:03:00", "2015-01-05 17:03:00")
)
# `df2[] <-` keeps the data.frame shape and column names while swapping in the
# converted columns.
df2[] <- lapply(df2, as.POSIXct, format = "%Y-%m-%d %H:%M:%S")
# Bin end times: each column (m0p1 .. m3p1) holds the end timestamps of that
# variable's time bins. Values are entered as text and then every column is
# parsed to POSIXct in a single pass.
df3 <- data.frame(
  m0p1 = c("2015-01-05 15:01:00", "2015-01-05 16:01:00", "2015-01-05 17:01:00"),
  m1p1 = c("2015-01-05 15:02:00", "2015-01-05 16:02:00", "2015-01-05 17:02:00"),
  m2p1 = c("2015-01-05 15:03:00", "2015-01-05 16:03:00", "2015-01-05 17:03:00"),
  m3p1 = c("2015-01-05 15:04:00", "2015-01-05 16:04:00", "2015-01-05 17:04:00")
)
# `df3[] <-` keeps the data.frame shape and column names while swapping in the
# converted columns.
df3[] <- lapply(df3, as.POSIXct, format = "%Y-%m-%d %H:%M:%S")
RESULT 结果将是这样的:
> head(df1.extended)
time.stamp g.id m0p1 m1p1 m2p1 m3p1
1 2015-01-05 15:00:00 848 TRUE FALSE FALSE FALSE
2 2015-01-05 15:01:00 737 FALSE TRUE FALSE FALSE
3 2015-01-05 15:02:00 848 FALSE FALSE TRUE FALSE
4 2015-01-05 15:02:00 848 FALSE FALSE TRUE FALSE
5 2015-01-05 15:03:00 737 FALSE FALSE FALSE TRUE
6 2015-01-05 15:03:00 848 FALSE FALSE FALSE TRUE
7 2015-01-05 15:03:00 737 FALSE FALSE FALSE TRUE
8 2015-01-05 15:03:00 737 FALSE FALSE FALSE TRUE
非常感谢任何指点。谢谢!
答案 0 :(得分:3)
您可以使用 data.table 包中的 foverlaps:
# Flag, for every event in df1 and every bin variable, whether the event
# matches one of that variable's time bins; the result is one logical column
# per variable next to the original event columns.
library(reshape2)
# Stack the wide bin tables into long form. The value column is named
# start / end; the code below relies on melt() also producing a `variable`
# column that carries the original column name (m0p1, m1p1, ...).
df2 <- melt(df2, value.name = "start")
df3 <- melt(df3, value.name = "end")
# Pair each start with its end purely by row position, so df2 and df3 must
# have the same columns and rows in the same order.
df2$end <- df3$end
# NOTE(review): data.table is attached after reshape2, so the dcast() call at
# the bottom presumably resolves to data.table's dcast -- confirm if the
# library() order is ever changed.
library(data.table)
setDT(df1)
setDT(df2)
# foverlaps() is given two columns for x (by.x below), so duplicate the
# timestamp to describe each event as a zero-width interval.
df1[, time.stamp2 := time.stamp]
setkey(df2, start, end)
# Run the overlap lookup once per bin variable; inside each group .SD holds
# that variable's (start, end) rows.
res <- df2[, foverlaps(df1, .SD,
by.x = c("time.stamp", "time.stamp2"),
by.y = c("start", "end"),
# NOTE(review): per the foverlaps documentation, type = "start" appears to
# match only when the event time equals a bin's start exactly, so an event
# strictly inside a bin (e.g. 15:00:30) would be flagged FALSE. The sample
# events all sit on bin starts -- confirm this is the intended rule.
# `match` is TRUE when a bin row was found; unmatched events presumably come
# back with NA in `start` -- verify.
type = "start")[,list(time.stamp, g.id, match = !is.na(start))],
by = variable]
# Number the events within each variable so that events sharing the same
# time.stamp and g.id remain separate rows when reshaping to wide form.
res[, id := seq_len(.N), by = variable]
# Back to wide form: one row per event, one logical column per bin variable.
dcast(res, id + time.stamp + g.id ~ variable, value.var = "match")
# id time.stamp g.id m0p1 m1p1 m2p1 m3p1
# 1 1 2015-01-05 15:00:00 848 TRUE FALSE FALSE FALSE
# 2 2 2015-01-05 15:01:00 737 FALSE TRUE FALSE FALSE
# 3 3 2015-01-05 15:02:00 848 FALSE FALSE TRUE FALSE
# 4 4 2015-01-05 15:02:00 848 FALSE FALSE TRUE FALSE
# 5 5 2015-01-05 15:03:00 737 FALSE FALSE FALSE TRUE
# 6 6 2015-01-05 15:03:00 848 FALSE FALSE FALSE TRUE
# 7 7 2015-01-05 15:03:00 737 FALSE FALSE FALSE TRUE
# 8 8 2015-01-05 15:03:00 737 FALSE FALSE FALSE TRUE