R - 使用 foverlaps 按时间区间联接表时出错

时间:2018-07-19 03:19:41

标签: r data.table

我正在尝试合并来自两个不同表的数据集:(1) 收视(观看)数据,以及 (2) 节目时间表。

目标是得到这样的输出:在每个观看时间区间内观看到的所有节目都列在同一行中。预期输出:

Person    Channel   program_Date  start_time  end_time  Prog1  Prog2   Prog3
Name A    NatGeo      1/1/2018      11:00       12:00     Doc A  Doc B  Doc C       
Name B    NatGeo      1/1/2018      12:30       14:00     Doc C  Doc D  -NA- 
Name B    HBO         1/1/2018      21:30       22:00     Mov A  -NA-   -NA- 
Name B    HBO         1/1/2018      22:30       23:30     Mov A  Mov B  -NA-

下面给出的是我在 RStudio 中运行的命令和错误输出,以及用 dput() 导出的两个数据集。我仍在学习 R,很难弄清这个错误的实际含义。我查看了 "R - merging two data sets within time duration/intervals" 这个问题,并尝试完全按照其中的步骤操作,但它一直报错,我仍然不确定为什么 time1 列中的值与 time2 列中的值不对应。谢谢

下面是 dt1 和 dt2 的样本数据。运行以下两行代码时会出现错误。我尝试过修复,但仍然出错:

> setkey(dt2_schedule, Channel, time1, time2)
 dt <- foverlaps(dt1_watching, dt2_schedule, by.x = c("Channel", "start", "end"), nomatch = 0L)
Error in foverlaps(dt1_watching, dt2_schedule, by.x = c("Channel", "start",  : 
  All entries in column time1 should be <= corresponding entries in column time2 in data.table 'y'



# Watching sessions from the question: one row per viewing interval, with
# `start`/`end` stored as POSIXct (seconds since the epoch).
# dput() of a data.table also prints `.internal.selfref = <pointer: ...>`;
# that entry is omitted here because it is not parseable R.
dt1_watching <- structure(
  list(
    Person = c("name1", "name2", "name3", "name1", "name2"),
    Channel = rep("FOX Action Movies HD", 5L),
    start = structure(
      c(1522564740, 1522566240, 1522566540, 1522654080, 1522655760),
      class = c("POSIXct", "POSIXt"), tzone = ""
    ),
    end = structure(
      c(1522566000, 1522567920, 1522567560, 1522656060, 1522658100),
      class = c("POSIXct", "POSIXt"), tzone = ""
    )
  ),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame"),
  .Names = c("Person", "Channel", "start", "end")
)

# Programme schedule from the question, keyed by Channel, time1, time2.
# NOTE: rows 15 ("STRAW DOGS") and 29 ("INSURGENT") have time2 < time1; these
# are the rows that make foverlaps() fail with "All entries in column time1
# should be <= corresponding entries in column time2". The values are kept
# exactly as posted so the error stays reproducible.
# dput() of a data.table also prints `.internal.selfref = <pointer: ...>`;
# that entry is omitted here because it is not parseable R.
dt2_schedule <- structure(
  list(
    Channel = rep("FOX Action Movies HD", 30L),
    Program = c(
      "NIGHT WATCH", "EXISTS", "MISSIONARY MAN", "NATURAL BORN KILLERS",
      "TANK 432", "EXTRATERRESTRIAL", "ENTRAPMENT",
      "GARM WARS: THE LAST DRUID", "STRAW DOGS", "VICE", "INSURGENT",
      "LILA & EVE", "KILLING SALAZAR", "HACKER", "STRAW DOGS",
      "LOOSE CANNONS", "THE LAZARUS EFFECT", "SHARKTOPUS VS. PTERACUDA",
      "GARM WARS: THE LAST DRUID", "EXISTS", "MAN VS.", "TANK 432",
      "LILA & EVE", "MISSIONARY MAN", "HACKER", "MAN ON FIRE",
      "A TIME TO KILL", "I HAD A BLOODY GOOD TIME AT HOUSE HARKER",
      "INSURGENT", "THE NEWTON BOYS"
    ),
    time1 = structure(
      c(
        1522555200, 1522561200, 1522562700, 1522568400, 1522575300,
        1522580700, 1522586700, 1522593600, 1522599600, 1522606200,
        1522611900, 1522619100, 1522624800, 1522630800, 1522637400,
        1522644000, 1522649700, 1522654800, 1522660200, 1522666200,
        1522671000, 1522676100, 1522681500, 1522687200, 1522692900,
        1522699500, 1522708200, 1522717200, 1522722300, 1522729500
      ),
      class = c("POSIXct", "POSIXt"), tzone = ""
    ),
    time2 = structure(
      c(
        1522561200, 1522562700, 1522568400, 1522575300, 1522580700,
        1522586700, 1522593600, 1522599600, 1522606200, 1522611900,
        1522619100, 1522624800, 1522630800, 1522637400, 1522557600,
        1522649700, 1522654800, 1522660200, 1522666200, 1522671000,
        1522676100, 1522681500, 1522687200, 1522692900, 1522699500,
        1522708200, 1522717200, 1522722300, 1522643100, 1522729500
      ),
      class = c("POSIXct", "POSIXt"), tzone = ""
    )
  ),
  row.names = c(NA, -30L),
  class = c("data.table", "data.frame"),
  .Names = c("Channel", "Program", "time1", "time2"),
  sorted = c("Channel", "time1", "time2")
)
  

未修改的数据

> dput(dt1_watching)
structure(list(V1 = c("name1", "name2", "name3", "name1", "name2"
), V2 = c("FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD"), V3 = c("4/1/2018", 
"4/1/2018", "4/1/2018", "4/2/2018", "4/2/2018"), V4 = c("2:39:00", 
"3:04:00", "3:09:00", "3:28:00", "3:56:00"), V5 = c("3:00:00", 
"3:32:00", "3:26:00", "4:01:00", "4:35:00")), .Names = c("V1", 
"V2", "V3", "V4", "V5"), row.names = c(NA, -5L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x0000000009000788>)
> dput(dt2_schedule)
structure(list(V1 = c("FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD", 
"FOX Action Movies HD"), V2 = c("NIGHT WATCH", "EXISTS", "MISSIONARY MAN", 
"NATURAL BORN KILLERS", "TANK 432", "EXTRATERRESTRIAL", "ENTRAPMENT", 
"GARM WARS: THE LAST DRUID", "STRAW DOGS", "VICE", "INSURGENT", 
"LILA & EVE", "KILLING SALAZAR", "HACKER", "STRAW DOGS", "LOOSE CANNONS", 
"THE LAZARUS EFFECT", "SHARKTOPUS VS. PTERACUDA", "GARM WARS: THE LAST DRUID", 
"EXISTS", "MAN VS.", "TANK 432", "LILA & EVE", "MISSIONARY MAN", 
"HACKER", "MAN ON FIRE", "A TIME TO KILL", "I HAD A BLOODY GOOD TIME AT HOUSE HARKER", 
"INSURGENT", "THE NEWTON BOYS"), V3 = c("4/1/2018", "4/1/2018", 
"4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", 
"4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", 
"4/1/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", 
"4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", 
"4/2/2018", "4/2/2018", "4/2/2018", "4/3/2018"), V4 = c("0:00:00", 
"1:40:00", "2:05:00", "3:40:00", "5:35:00", "7:05:00", "8:45:00", 
"10:40:00", "12:20:00", "14:10:00", "15:45:00", "17:45:00", "19:20:00", 
"21:00:00", "22:50:00", "0:40:00", "2:15:00", "3:40:00", "5:10:00", 
"6:50:00", "8:10:00", "9:35:00", "11:05:00", "12:40:00", "14:15:00", 
"16:05:00", "18:30:00", "21:00:00", "22:25:00", "0:25:00")), .Names = c("V1", 
"V2", "V3", "V4"), row.names = c(NA, -30L), class = c("data.table", 
"data.frame"), .internal.selfref = <pointer: 0x0000000009000788>)

1 个答案:

答案 0 :(得分:1)

试试这样如何?

library(data.table)

# Build one row per watching interval listing every programme that was on
# air during that interval (Prog1, Prog2, ...).
# Expects `dt1_watching` (V1..V5) and `dt2_schedule` (V1..V4) as raw
# character data.tables; both are modified by reference.

# Name the columns in place (setnames avoids the copy made by `names<-`)
setnames(
  dt1_watching,
  c("Person", "Channel", "program_Date", "start_time", "end_time")
)
setnames(dt2_schedule, c("Channel", "Program", "program_Date", "start_time"))

# Convert date & time to POSIXct. The input times carry seconds ("2:39:00"),
# so parse them explicitly instead of letting "%H:%M" silently drop them.
# No tz is given, so values are interpreted in the session's time zone.
datetime_format <- "%m/%d/%Y %H:%M:%S"
dt1_watching[, `:=`(
  start = as.POSIXct(paste(program_Date, start_time), format = datetime_format),
  end = as.POSIXct(paste(program_Date, end_time), format = datetime_format)
)]
dt2_schedule[
  ,
  time1 := as.POSIXct(paste(program_Date, start_time), format = datetime_format)
]

# Fail early with a clear message if any timestamp could not be parsed
if (anyNA(dt1_watching$start) || anyNA(dt1_watching$end) ||
  anyNA(dt2_schedule$time1)) {
  stop("Some date/time values could not be parsed", call. = FALSE)
}

# foverlaps() needs a start AND an end for every schedule row, so each
# programme's end (time2) is taken from the next programme's start on the
# same channel. The rows must be in chronological order for that to hold,
# otherwise time2 can fall before time1 and foverlaps() rejects the table.
# The last programme of a channel has no successor; its end is filled with
# its own start, giving a zero-length interval.
setorder(dt2_schedule, Channel, time1)
dt2_schedule[
  ,
  time2 := shift(time1, 1L, type = "lead", fill = max(time1)),
  by = Channel
]

# Remove unnecessary columns in preparation for final output
dt1_watching[, c("program_Date", "start_time", "end_time") := NULL]
dt2_schedule[, c("program_Date", "start_time") := NULL]

# Join on channel and overlapping intervals
# Once joined, remove time1 and time2
setkey(dt2_schedule, Channel, time1, time2)
dt <- foverlaps(
  dt1_watching, dt2_schedule,
  by.x = c("Channel", "start", "end"),
  nomatch = 0L
)
dt[, c("time1", "time2") := NULL]

# Spread long to wide: number the programmes within each watching interval
# (same grouping columns as the dcast row keys), then cast to Prog1, Prog2, ...
dt[
  ,
  idx := paste0("Prog", seq_len(.N)),
  by = c("Channel", "Person", "start", "end")
]
dcast(
  dt,
  Channel + Person + start + end ~ idx,
  value.var = "Program"
)[order(Person, start)]
#                Channel Person               start                 end
#1: FOX Action Movies HD  name1 2018-04-01 02:39:00 2018-04-01 03:00:00
#2: FOX Action Movies HD  name1 2018-04-02 03:28:00 2018-04-02 04:01:00
#3: FOX Action Movies HD  name2 2018-04-01 03:04:00 2018-04-01 03:32:00
#4: FOX Action Movies HD  name2 2018-04-02 03:56:00 2018-04-02 04:35:00
#5: FOX Action Movies HD  name3 2018-04-01 03:09:00 2018-04-01 03:26:00
#                      Prog1                    Prog2
#1:           MISSIONARY MAN                       NA
#2:       THE LAZARUS EFFECT SHARKTOPUS VS. PTERACUDA
#3:           MISSIONARY MAN                       NA
#4: SHARKTOPUS VS. PTERACUDA                       NA
#5:           MISSIONARY MAN                       NA

与我之前的答案唯一的区别在于 dt2_schedule 结束时间的计算方式。请注意,您的样本数据中只有一个 Channel;如果有多个频道,则需要按 Channel 分组执行此操作(因此上面使用了 by = Channel)。


样本数据

# Raw watching log, all columns character. The answer script renames
# V1..V5 to Person, Channel, program_Date, start_time, end_time.
dt1_watching <- structure(
  list(
    V1 = c("name1", "name2", "name3", "name1", "name2"),
    V2 = rep("FOX Action Movies HD", 5L),
    V3 = rep(c("4/1/2018", "4/2/2018"), times = c(3L, 2L)),
    V4 = c("2:39:00", "3:04:00", "3:09:00", "3:28:00", "3:56:00"),
    V5 = c("3:00:00", "3:32:00", "3:26:00", "4:01:00", "4:35:00")
  ),
  .Names = c("V1", "V2", "V3", "V4", "V5"),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame")
)


# Raw programme schedule, all columns character. The answer script renames
# V1..V4 to Channel, Program, program_Date, start_time.
dt2_schedule <- structure(
  list(
    V1 = rep("FOX Action Movies HD", 30L),
    V2 = c(
      "NIGHT WATCH", "EXISTS", "MISSIONARY MAN", "NATURAL BORN KILLERS",
      "TANK 432", "EXTRATERRESTRIAL", "ENTRAPMENT",
      "GARM WARS: THE LAST DRUID", "STRAW DOGS", "VICE", "INSURGENT",
      "LILA & EVE", "KILLING SALAZAR", "HACKER", "STRAW DOGS",
      "LOOSE CANNONS", "THE LAZARUS EFFECT", "SHARKTOPUS VS. PTERACUDA",
      "GARM WARS: THE LAST DRUID", "EXISTS", "MAN VS.", "TANK 432",
      "LILA & EVE", "MISSIONARY MAN", "HACKER", "MAN ON FIRE",
      "A TIME TO KILL", "I HAD A BLOODY GOOD TIME AT HOUSE HARKER",
      "INSURGENT", "THE NEWTON BOYS"
    ),
    V3 = rep(
      c("4/1/2018", "4/2/2018", "4/3/2018"),
      times = c(15L, 14L, 1L)
    ),
    V4 = c(
      "0:00:00", "1:40:00", "2:05:00", "3:40:00", "5:35:00",
      "7:05:00", "8:45:00", "10:40:00", "12:20:00", "14:10:00",
      "15:45:00", "17:45:00", "19:20:00", "21:00:00", "22:50:00",
      "0:40:00", "2:15:00", "3:40:00", "5:10:00", "6:50:00",
      "8:10:00", "9:35:00", "11:05:00", "12:40:00", "14:15:00",
      "16:05:00", "18:30:00", "21:00:00", "22:25:00", "0:25:00"
    )
  ),
  .Names = c("V1", "V2", "V3", "V4"),
  row.names = c(NA, -30L),
  class = c("data.table", "data.frame")
)