我有两张桌子。 table1
看起来像这样
date hour data
2010-05-01 3 5
2010-05-02 7 7
2010-05-02 10 8
2010-07-03 18 3
2011-12-09 22 1
2012-05-01 3 0
它存储为data.table
,其密钥设置在date
和hour
上。
我有另一张桌子,看起来像这样。这是我的outages
表。
resource date_out date_back
joey 2010-04-30 4:00:00 2010-05-02 8:30:00
billy 2009-04-20 7:00:00 2009-02-02 5:30:00
bob 2011-11-15 12:20:00 2010-12-09 23:00:00
joey 2012-04-28 1:00:00 2012-05-02 17:00:00
我想将列添加到table1
,其中这些列是outages
表中的资源。我希望这些列中的值在没有中断时为0,在有时为1时为。
此示例的结果应为。
date hour data joey billy bob
2010-05-01 3 5 1 0 0
2010-05-02 7 7 1 0 0
2010-05-02 10 8 0 0 0
2010-07-03 18 3 0 0 0
2011-12-09 22 1 0 0 1
2012-05-01 3 0 1 0 0
实际上我的table1
有大约2500行,而我的outages
表有19000.我能想到的唯一方法是循环遍历outages
表的每一行,然后在正确的位置将{1}插入table1
。我的代码依赖于table1
的顺序,所以至少它不必为outages
的每一行扫描该表的100%。但是,我的数据下面需要4个小时。
for (out in 1:length(outages$resource)) {
a<-as.character(outages[out]$resource)
#if column doesn't exist then create it
if (a %in% colnames(table1)==FALSE) {
table1$new<-0
setnames(table1, "new", a)
}
midpoint<-round(length(table1$date)/2,0)
if (table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages [out]$due_back)
{
while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint-1
}
midpoint<-round(length(table1$date)/2,0)
while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint+1
}
} else {
if (table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back) {
while(table1$date[midpoint]+table1$hour[midpoint]*60*60>outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
midpoint<-midpoint-1
}
while(table1$date[midpoint]+table1$hour[midpoint]*60*60>=outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint-1
}
}
midpoint<-round(length(table1$date)/2,0)
if (table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out) {
while(table1$date[midpoint]+table1$hour[midpoint]*60*60<outages[out]$due_out && midpoint>=1 && midpoint<=length(table1$date)) {
midpoint<-midpoint+1
}
while(table1$date[midpoint]+table1$hour[midpoint]*60*60<=outages[out]$due_back && midpoint>=1 && midpoint<=length(table1$date)) {
table1[midpoint,a:=1,with=FALSE]
midpoint<-midpoint+1
}
}
}
if (sum(table1[,a,with=FALSE])==0) {
table1[,a:=NULL,with=FALSE]
}
}
引用每个人最喜欢的电视购物广告系列“必须有更好的方式”。
答案 0 :(得分:2)
这是实现你想要的方式。假设您的table1
时间精度为1小时。虽然它可以被修改为任意精度,但它会在更大的时间间隔内表现更好,因为它构造了date_out
- date_back
范围内可能时间的完整序列。注意,我使用与OP略有不同的表来说明重叠间隔并纠正OP中的一些错误。
table1 = data.table(date = c("2010-05-01", "2010-05-02", "2010-05-02", "2010-07-03", "2011-12-09", "2012-05-01"), hour = c(3,7,10,18,22,3), data = c(5,7,8,3,1,0))
outages = data.table(resource = c("joey", "bob", "billy", "bob", "joey"), date_out = c("2010-04-30 4:00:00", "2010-04-30 4:00:00", "2009-04-20 7:00:00", "2011-11-15 12:20:00", "2012-04-28 1:00:00"), date_back=c("2010-05-02 8:30:00", "2010-05-02 8:30:00", "2009-06-02 5:30:00", "2011-12-09 23:00:00", "2012-05-02 17:00:00"))
# round up date_out and round down date_back
# and create a sequence in-between spaced by 1 hour
outages[, list(datetime = seq(as.POSIXct(round(as.POSIXct(date_out) + 30*60-1, "hours")),
as.POSIXct(round(as.POSIXct(date_back) - 30*60, "hours")),
60*60)),
by = list(resource, date_out)] -> outages.expanded
setkey(outages.expanded, datetime)
# merge with the original table, then run "table" to get the frequencies/occurences
# and cbind back with the original table
cbind(table1, unclass(table(
outages.expanded[table1[, list(datetime=as.POSIXct(paste0(date, " ", hour, ":00:00")))],
resource])))
# date hour data bob joey
#1: 2010-05-01 3 5 1 1
#2: 2010-05-02 7 7 1 1
#3: 2010-05-02 10 8 0 0
#4: 2010-07-03 18 3 0 0
#5: 2011-12-09 22 1 1 0
#6: 2012-05-01 3 0 0 1