我正在尝试合并两个数据集:(1)收视(观看)数据,以及(2)节目时间表。
目标是把每个观看时间段内看过的所有节目列在同一行中。预期输出:
Person Channel program_Date start_time end_time Prog1 Prog2 Prog3
Name A NatGeo 1/1/2018 11:00 12:00 Doc A Doc B Doc C
Name B NatGeo 1/1/2018 12:30 14:00 Doc C Doc D -NA-
Name B HBO 1/1/2018 21:30 22:00 Mov A -NA- -NA-
Name B HBO 1/1/2018 22:30 23:30 Mov A Mov B -NA-
以下是在RStudio中运行的命令和错误输出,以及用dput()导出的两个数据集示例。
我仍在学习R,很难弄清这个错误的实际含义。我查看了“R - merging two data sets within time duration/intervals”这个问题,并尝试完全按照其中的步骤操作,但仍然一直报错,而且我还是不明白为什么time1列中的值与time2列中的值不满足要求(time1 <= time2)。谢谢。
下面是dt1和dt2的样本数据。运行下面这两行代码时会报错。我尝试过修复,但仍然出错:
> setkey(dt2_schedule, Channel, time1, time2)
dt <- foverlaps(dt1_watching, dt2_schedule, by.x = c("Channel", "start", "end"), nomatch = 0L)
Error in foverlaps(dt1_watching, dt2_schedule, by.x = c("Channel", "start", :
All entries in column time1 should be <= corresponding entries in column time2 in data.table 'y'
# Sample viewing sessions: one row per person and session, with the channel
# watched and the session's start/end instants (POSIXct, seconds since epoch).
# The `.internal.selfref = <pointer: ...>` attribute that dput() prints for a
# data.table is not parseable R, so it is omitted here; call
# data.table::setDT(dt1_watching) after creating the object to restore it.
dt1_watching <- structure(
  list(
    Person = c("name1", "name2", "name3", "name1", "name2"),
    Channel = rep("FOX Action Movies HD", 5L),
    start = structure(
      c(1522564740, 1522566240, 1522566540, 1522654080, 1522655760),
      class = c("POSIXct", "POSIXt"), tzone = ""
    ),
    end = structure(
      c(1522566000, 1522567920, 1522567560, 1522656060, 1522658100),
      class = c("POSIXct", "POSIXt"), tzone = ""
    )
  ),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame")
)
# Sample programme schedule as it looked when foverlaps() failed: one row per
# programme with its channel, title, and start (time1) / end (time2) instants.
# Rows 15 and 29 have time2 earlier than time1, which is exactly what the
# "All entries in column time1 should be <= ... time2" error complains about.
# The `.internal.selfref = <pointer: ...>` attribute printed by dput() is not
# parseable R and is omitted; call data.table::setDT(dt2_schedule) to restore
# it. The `sorted` attribute is kept because the table was dumped after
# setkey(dt2_schedule, Channel, time1, time2).
dt2_schedule <- structure(
  list(
    Channel = rep("FOX Action Movies HD", 30L),
    Program = c(
      "NIGHT WATCH", "EXISTS", "MISSIONARY MAN", "NATURAL BORN KILLERS",
      "TANK 432", "EXTRATERRESTRIAL", "ENTRAPMENT",
      "GARM WARS: THE LAST DRUID", "STRAW DOGS", "VICE", "INSURGENT",
      "LILA & EVE", "KILLING SALAZAR", "HACKER", "STRAW DOGS",
      "LOOSE CANNONS", "THE LAZARUS EFFECT", "SHARKTOPUS VS. PTERACUDA",
      "GARM WARS: THE LAST DRUID", "EXISTS", "MAN VS.", "TANK 432",
      "LILA & EVE", "MISSIONARY MAN", "HACKER", "MAN ON FIRE",
      "A TIME TO KILL", "I HAD A BLOODY GOOD TIME AT HOUSE HARKER",
      "INSURGENT", "THE NEWTON BOYS"
    ),
    time1 = structure(
      c(
        1522555200, 1522561200, 1522562700, 1522568400, 1522575300,
        1522580700, 1522586700, 1522593600, 1522599600, 1522606200,
        1522611900, 1522619100, 1522624800, 1522630800, 1522637400,
        1522644000, 1522649700, 1522654800, 1522660200, 1522666200,
        1522671000, 1522676100, 1522681500, 1522687200, 1522692900,
        1522699500, 1522708200, 1522717200, 1522722300, 1522729500
      ),
      class = c("POSIXct", "POSIXt"), tzone = ""
    ),
    time2 = structure(
      c(
        1522561200, 1522562700, 1522568400, 1522575300, 1522580700,
        1522586700, 1522593600, 1522599600, 1522606200, 1522611900,
        1522619100, 1522624800, 1522630800, 1522637400, 1522557600,
        1522649700, 1522654800, 1522660200, 1522666200, 1522671000,
        1522676100, 1522681500, 1522687200, 1522692900, 1522699500,
        1522708200, 1522717200, 1522722300, 1522643100, 1522729500
      ),
      class = c("POSIXct", "POSIXt"), tzone = ""
    )
  ),
  row.names = c(NA, -30L),
  class = c("data.table", "data.frame"),
  sorted = c("Channel", "time1", "time2")
)
未修改的数据
> dput(dt1_watching)
structure(list(V1 = c("name1", "name2", "name3", "name1", "name2"
), V2 = c("FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD"), V3 = c("4/1/2018",
"4/1/2018", "4/1/2018", "4/2/2018", "4/2/2018"), V4 = c("2:39:00",
"3:04:00", "3:09:00", "3:28:00", "3:56:00"), V5 = c("3:00:00",
"3:32:00", "3:26:00", "4:01:00", "4:35:00")), .Names = c("V1",
"V2", "V3", "V4", "V5"), row.names = c(NA, -5L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000009000788>)
> dput(dt2_schedule)
structure(list(V1 = c("FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD", "FOX Action Movies HD", "FOX Action Movies HD",
"FOX Action Movies HD"), V2 = c("NIGHT WATCH", "EXISTS", "MISSIONARY MAN",
"NATURAL BORN KILLERS", "TANK 432", "EXTRATERRESTRIAL", "ENTRAPMENT",
"GARM WARS: THE LAST DRUID", "STRAW DOGS", "VICE", "INSURGENT",
"LILA & EVE", "KILLING SALAZAR", "HACKER", "STRAW DOGS", "LOOSE CANNONS",
"THE LAZARUS EFFECT", "SHARKTOPUS VS. PTERACUDA", "GARM WARS: THE LAST DRUID",
"EXISTS", "MAN VS.", "TANK 432", "LILA & EVE", "MISSIONARY MAN",
"HACKER", "MAN ON FIRE", "A TIME TO KILL", "I HAD A BLOODY GOOD TIME AT HOUSE HARKER",
"INSURGENT", "THE NEWTON BOYS"), V3 = c("4/1/2018", "4/1/2018",
"4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018",
"4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018", "4/1/2018",
"4/1/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018",
"4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018", "4/2/2018",
"4/2/2018", "4/2/2018", "4/2/2018", "4/3/2018"), V4 = c("0:00:00",
"1:40:00", "2:05:00", "3:40:00", "5:35:00", "7:05:00", "8:45:00",
"10:40:00", "12:20:00", "14:10:00", "15:45:00", "17:45:00", "19:20:00",
"21:00:00", "22:50:00", "0:40:00", "2:15:00", "3:40:00", "5:10:00",
"6:50:00", "8:10:00", "9:35:00", "11:05:00", "12:40:00", "14:15:00",
"16:05:00", "18:30:00", "21:00:00", "22:25:00", "0:25:00")), .Names = c("V1",
"V2", "V3", "V4"), row.names = c(NA, -30L), class = c("data.table",
"data.frame"), .internal.selfref = <pointer: 0x0000000009000788>)
答案 0 :(得分:1)
这样可以吗?
library(data.table)

# The raw tables are rebuilt from dput() output, so they carry no valid
# internal self-reference; setDT() restores it so that `:=` and the set*
# functions can safely modify the tables by reference.
setDT(dt1_watching)
setDT(dt2_schedule)

# Add column names (by reference)
setnames(
  dt1_watching,
  c("Person", "Channel", "program_Date", "start_time", "end_time")
)
setnames(dt2_schedule, c("Channel", "Program", "program_Date", "start_time"))

# Convert date & time to POSIXct.
# The time strings look like "2:39:00", so the format includes %S (with
# "%H:%M" the seconds would be silently ignored). The time zone is pinned to
# UTC so that a wall-clock time falling into a local daylight-saving gap
# cannot turn into NA.
dt1_watching[, `:=`(
  start = as.POSIXct(paste(program_Date, start_time),
                     format = "%m/%d/%Y %H:%M:%S", tz = "UTC"),
  end = as.POSIXct(paste(program_Date, end_time),
                   format = "%m/%d/%Y %H:%M:%S", tz = "UTC")
)]
# A session that runs past midnight has an end time earlier than its start on
# the same calendar date; roll the end forward one day so foverlaps() does not
# reject the interval. NOTE(review): assumes no session is longer than 24h.
dt1_watching[end < start, end := end + 86400]

# foverlaps() requires both a start and an end for every interval. The
# schedule only has start times, so each programme's end (time2) is the start
# of the next programme on the same channel, obtained with shift(). The last
# programme of a channel has no successor and gets its own start as its end
# (a zero-length interval).
dt2_schedule[, time1 := as.POSIXct(paste(program_Date, start_time),
                                   format = "%m/%d/%Y %H:%M:%S", tz = "UTC")]
# The lead below is only meaningful when rows are in time order per channel.
setorder(dt2_schedule, Channel, time1)
dt2_schedule[, time2 := shift(time1, 1L, type = "lead", fill = max(time1)),
             by = Channel]

# Remove unnecessary columns in preparation for final output
dt1_watching[, `:=`(program_Date = NULL, start_time = NULL, end_time = NULL)]
dt2_schedule[, `:=`(program_Date = NULL, start_time = NULL)]

# Join on channel and overlapping intervals
# Once joined, remove time1 and time2
setkey(dt2_schedule, Channel, time1, time2)
dt <- foverlaps(dt1_watching, dt2_schedule,
                by.x = c("Channel", "start", "end"), nomatch = 0L)
dt[, `:=`(time1 = NULL, time2 = NULL)]

# Spread long to wide: number the programmes within each viewing session
# (Prog1, Prog2, ...) and cast them into one column each.
dt[, idx := paste0("Prog", seq_len(.N)), by = c("Channel", "Person", "start")]
dcast(dt, Channel + Person + start + end ~ idx,
      value.var = "Program")[order(Person, start)]
# Channel Person start end
#1: FOX Action Movies HD name1 2018-04-01 02:39:00 2018-04-01 03:00:00
#2: FOX Action Movies HD name1 2018-04-02 03:28:00 2018-04-02 04:01:00
#3: FOX Action Movies HD name2 2018-04-01 03:04:00 2018-04-01 03:32:00
#4: FOX Action Movies HD name2 2018-04-02 03:56:00 2018-04-02 04:35:00
#5: FOX Action Movies HD name3 2018-04-01 03:09:00 2018-04-01 03:26:00
# Prog1 Prog2
#1: MISSIONARY MAN NA
#2: THE LAZARUS EFFECT SHARKTOPUS VS. PTERACUDA
#3: MISSIONARY MAN NA
#4: SHARKTOPUS VS. PTERACUDA NA
#5: MISSIONARY MAN NA
与我以前的答案唯一的区别是dt2_schedule的结束时间(time2)的计算方式。
请注意,您的示例数据中只有一个Channel;如果有多个频道,
则需要按Channel分组来执行此操作(这就是上面使用by = Channel的原因)。
# Raw viewing data with default column names V1..V5. Judging by the values:
# person, channel, date (m/d/Y), start time and end time (H:M:S), all stored
# as character.
dt1_watching <- structure(
  list(
    V1 = c("name1", "name2", "name3", "name1", "name2"),
    V2 = rep("FOX Action Movies HD", 5L),
    V3 = rep(c("4/1/2018", "4/2/2018"), times = c(3L, 2L)),
    V4 = c("2:39:00", "3:04:00", "3:09:00", "3:28:00", "3:56:00"),
    V5 = c("3:00:00", "3:32:00", "3:26:00", "4:01:00", "4:35:00")
  ),
  row.names = c(NA, -5L),
  class = c("data.table", "data.frame")
)
# Raw programme schedule with default column names V1..V4. Judging by the
# values: channel, programme title, date (m/d/Y) and start time (H:M:S), all
# stored as character. There is no end-time column.
dt2_schedule <- structure(
  list(
    V1 = rep("FOX Action Movies HD", 30L),
    V2 = c(
      "NIGHT WATCH", "EXISTS", "MISSIONARY MAN", "NATURAL BORN KILLERS",
      "TANK 432", "EXTRATERRESTRIAL", "ENTRAPMENT",
      "GARM WARS: THE LAST DRUID", "STRAW DOGS", "VICE", "INSURGENT",
      "LILA & EVE", "KILLING SALAZAR", "HACKER", "STRAW DOGS",
      "LOOSE CANNONS", "THE LAZARUS EFFECT", "SHARKTOPUS VS. PTERACUDA",
      "GARM WARS: THE LAST DRUID", "EXISTS", "MAN VS.", "TANK 432",
      "LILA & EVE", "MISSIONARY MAN", "HACKER", "MAN ON FIRE",
      "A TIME TO KILL", "I HAD A BLOODY GOOD TIME AT HOUSE HARKER",
      "INSURGENT", "THE NEWTON BOYS"
    ),
    V3 = rep(
      c("4/1/2018", "4/2/2018", "4/3/2018"),
      times = c(15L, 14L, 1L)
    ),
    V4 = c(
      "0:00:00", "1:40:00", "2:05:00", "3:40:00", "5:35:00", "7:05:00",
      "8:45:00", "10:40:00", "12:20:00", "14:10:00", "15:45:00", "17:45:00",
      "19:20:00", "21:00:00", "22:50:00", "0:40:00", "2:15:00", "3:40:00",
      "5:10:00", "6:50:00", "8:10:00", "9:35:00", "11:05:00", "12:40:00",
      "14:15:00", "16:05:00", "18:30:00", "21:00:00", "22:25:00", "0:25:00"
    )
  ),
  row.names = c(NA, -30L),
  class = c("data.table", "data.frame")
)