用R函数对变量计算表达式,并创建以逻辑值(TRUE/FALSE)填充的新变量

时间:2015-03-28 13:23:05

标签: r

df1(下面)是一个事件日志。变量1由(非唯一)时间戳(POSIXCt)组成。变量2:4由事件(因子)的属性组成。

我创建了df2和df3来定义时间段。 df2存储初始时间,df3存储每个时间段的结束时间。

问题是:如何用df2的变量名(与df3的变量名相同)扩展df1,并根据每个事件是否落在该变量的某一个时间段内,为其填入TRUE或FALSE。换句话说,如果事件落在某个时间段(由df2和df3定义)内,则值为TRUE,否则为FALSE。df1中的每个事件都需要逐个变量地(df2和df3中的同名变量)与该变量的所有时间段(由df2和df3中对应元素构成的区间)进行比较。

由于存在大量变量和事件,我无法以交互方式执行此操作。 但是想学习如何用R方式做,避免显式循环,并利用向量化。

DATA(小型采样数据集)

# Event log: one row per event. `time.stamp` is a (non-unique) POSIXct time
# stamp in the session's local time zone; `g.id` is a factor attribute.
df1 <- data.frame(
  time.stamp = as.POSIXct(
    strptime(
      rep(
        c("2015-01-05 15:00:00", "2015-01-05 15:01:00",
          "2015-01-05 15:02:00", "2015-01-05 15:03:00"),
        times = c(1L, 1L, 2L, 4L)
      ),
      "%Y-%m-%d %H:%M:%S"
    )
  ),
  g.id = factor(c("848", "737", "848", "848", "737", "848", "737", "737"))
)

# Start times of the time bins: one column per variable, one row per bin.
df2 <- data.frame(
  m0p1 = c("2015-01-05 15:00:00", "2015-01-05 16:00:00", "2015-01-05 17:00:00"),
  m1p1 = c("2015-01-05 15:01:00", "2015-01-05 16:01:00", "2015-01-05 17:01:00"),
  m2p1 = c("2015-01-05 15:02:00", "2015-01-05 16:02:00", "2015-01-05 17:02:00"),
  m3p1 = c("2015-01-05 15:03:00", "2015-01-05 16:03:00", "2015-01-05 17:03:00")
)
# Parse every column to POSIXct in a single pass; assigning into `df2[]`
# keeps the data frame and its column names intact.
df2[] <- lapply(
  df2,
  function(col) as.POSIXct(strptime(col, "%Y-%m-%d %H:%M:%S"))
)

# End times of the time bins, laid out like the table of start times: one
# column per variable, one row per bin.
df3 <- data.frame(
  m0p1 = c("2015-01-05 15:01:00", "2015-01-05 16:01:00", "2015-01-05 17:01:00"),
  m1p1 = c("2015-01-05 15:02:00", "2015-01-05 16:02:00", "2015-01-05 17:02:00"),
  m2p1 = c("2015-01-05 15:03:00", "2015-01-05 16:03:00", "2015-01-05 17:03:00"),
  m3p1 = c("2015-01-05 15:04:00", "2015-01-05 16:04:00", "2015-01-05 17:04:00")
)
# Parse every column to POSIXct in a single pass; assigning into `df3[]`
# keeps the data frame and its column names intact.
df3[] <- lapply(
  df3,
  function(col) as.POSIXct(strptime(col, "%Y-%m-%d %H:%M:%S"))
)

RESULT 结果将是这样的:

> head(df1.extended)
          time.stamp  g.id  m0p1   m1p1   m2p1   m3p1
1 2015-01-05 15:00:00  848  TRUE   FALSE  FALSE  FALSE
2 2015-01-05 15:01:00  737  FALSE  TRUE   FALSE  FALSE 
3 2015-01-05 15:02:00  848  FALSE  FALSE  TRUE   FALSE
4 2015-01-05 15:02:00  848  FALSE  FALSE  TRUE   FALSE
5 2015-01-05 15:03:00  737  FALSE  FALSE  FALSE  TRUE
6 2015-01-05 15:03:00  848  FALSE  FALSE  FALSE  TRUE
7 2015-01-05 15:03:00  737  FALSE  FALSE  FALSE  TRUE
8 2015-01-05 15:03:00  737  FALSE  FALSE  FALSE  TRUE

非常感谢任何指点。谢谢!

1 个答案:

答案 0 :(得分:3)

您可以使用data.table包中的foverlaps函数:

# Flag, for every event in df1 and every bin variable, whether the event falls
# inside one of that variable's time bins, then reshape to one logical column
# per variable.
library(reshape2)

# Long form: one row per (variable, bin). `variable` holds the original
# column name (m0p1, m1p1, ...); the value column is the bin start / end.
df2 <- melt(df2, value.name = "start")
df3 <- melt(df3, value.name = "end")
# The end times are attached by row position, so both tables must list the
# same variables in the same order.
stopifnot(identical(as.character(df2$variable), as.character(df3$variable)))
df2$end <- df3$end

library(data.table)
setDT(df1)
setDT(df2)

# foverlaps() joins intervals, so each event becomes a zero-length interval.
df1[, time.stamp2 := time.stamp]

setkey(df2, start, end)
# Per variable, join the events against that variable's bins.
# type = "within" finds bins with start <= time.stamp <= end (closed), and
# mult = "last" keeps exactly one row per event so the row ids below line up.
# The extra `time.stamp < end` test makes the bins half-open, [start, end):
# an event sitting exactly on a bin's end is not counted for that bin, which
# is what the expected result requires (15:01:00 belongs to m1p1 only).
res <- df2[, foverlaps(df1, .SD,
                by.x = c("time.stamp", "time.stamp2"),
                by.y = c("start", "end"),
                type = "within",
                mult = "last")[, list(time.stamp, g.id,
                                      match = !is.na(start) &
                                        time.stamp < end)],
    by = variable]
# Events are not unique, so number the rows within each variable to give
# dcast() a key that identifies one event.
res[, id := seq_len(.N), by = variable]

# Wide form: one row per event, one logical column per variable.
dcast(res, id + time.stamp + g.id ~ variable, value.var = "match")
#   id          time.stamp g.id  m0p1  m1p1  m2p1  m3p1
# 1  1 2015-01-05 15:00:00  848  TRUE FALSE FALSE FALSE
# 2  2 2015-01-05 15:01:00  737 FALSE  TRUE FALSE FALSE
# 3  3 2015-01-05 15:02:00  848 FALSE FALSE  TRUE FALSE
# 4  4 2015-01-05 15:02:00  848 FALSE FALSE  TRUE FALSE
# 5  5 2015-01-05 15:03:00  737 FALSE FALSE FALSE  TRUE
# 6  6 2015-01-05 15:03:00  848 FALSE FALSE FALSE  TRUE
# 7  7 2015-01-05 15:03:00  737 FALSE FALSE FALSE  TRUE
# 8  8 2015-01-05 15:03:00  737 FALSE FALSE FALSE  TRUE