我正在尝试合并两个数据框(dataframes),它们的共同字段是时间。但是,两者记录的时间可能略有不同。我希望按时间合并这两个数据框,并允许 30 分钟的缓冲间隔。
这两个 dataframes 在概念上的设置如下:
Data_cam <- data.frame(Start_haul=c(("31-10-2015 07:13:00"),("31-10-2015 22:40:00"),("01-11-2015 06:48:00"),("01-11-2015 16:13:00")),
VesselID=c('XBBX','XBBX','XAAX','XAAX'),
Species=("TOR"), Discard=c(0.28,0.96,2.92,0))
Data_sif <- data.frame(Start_haul=c(("31-10-2015 07:05:00"),("31-10-2015 07:05:00"),("31-10-2015 07:05:00"),("31-10-2015 23:05:00"),("31-10-2015 23:05:00"),("01-11-2015 06:28:00"),("01-11-2015 06:28:00"),("01-11-2015 06:28:00"),("01-11-2015 16:11:00")), VesselID=c('XBBX','XBBX','XBBX','XBBX','XBBX','XAAX','XAAX','XAAX','XAAX'),Species=("TOR"), Size_class=c("1","2","3","4","5","1","2","4","5"), Landing_kg=c(10.5,20.5,5.6,400,2,120,250,10.3,2.1))
这意味着 Data_sif 中的前三行与 Data_cam 中的第一行匹配,我希望将 Data_cam 第一行的 "Discard" 值添加到 Data_sif 的前三行。同样,Data_sif 中的第 4 行和第 5 行与 Data_cam 中的第二行匹配,我希望把对应的 "Discard" 值添加到这两行,依此类推。对于同一个时间戳,"Discard" 列中的值应针对 "Size_class" 列中的每个值重复出现。
所需的输出看起来像这样
# Expected result: the nine Data_sif rows with the Discard value of the
# matching Data_cam haul repeated on every row of that haul.
Data_combined <- data.frame(
  Start_haul = rep(
    c("31-10-2015 07:05:00", "31-10-2015 23:05:00",
      "01-11-2015 06:28:00", "01-11-2015 16:11:00"),
    times = c(3, 2, 3, 1)
  ),
  VesselID = rep(c("XBBX", "XAAX"), times = c(5, 4)),
  Species = "TOR",
  Size_class = c("1", "2", "3", "4", "5", "1", "2", "4", "5"),
  Landing_kg = c(10.5, 20.5, 5.6, 400, 2, 120, 250, 10.3, 2.1),
  # One Discard value per haul, repeated for each size class of that haul.
  Discard = rep(c(0.28, 0.96, 2.92, 0), times = c(3, 2, 3, 1))
)
我想在最终实现中添加更多列,包括位置数据,但为了简单起见,我想从合并Discard-column开始。
我已经尝试了旧帖子但未能为我拥有的数据实现它。
答案 0 :(得分:1)
以下是使用 lubridate 和 dplyr 的解决方案。它有点繁琐,但确实有效:
library(lubridate)
library(dplyr)
# Data_cam: four hauls (two per vessel), each with a single Discard value.
Data_cam <- data.frame(
  Start_haul = c("31-10-2015 07:13:00", "31-10-2015 22:40:00",
                 "01-11-2015 06:48:00", "01-11-2015 16:13:00"),
  VesselID = rep(c("XBBX", "XAAX"), each = 2),
  Species = "TOR",
  Discard = c(0.28, 0.96, 2.92, 0)
)
# Data_sif: nine landing rows, several size classes per haul. Its Start_haul
# values differ by some minutes from the ones stored in Data_cam.
Data_sif <- data.frame(
  Start_haul = rep(
    c("31-10-2015 07:05:00", "31-10-2015 23:05:00",
      "01-11-2015 06:28:00", "01-11-2015 16:11:00"),
    times = c(3, 2, 3, 1)
  ),
  VesselID = rep(c("XBBX", "XAAX"), times = c(5, 4)),
  Species = "TOR",
  Size_class = c("1", "2", "3", "4", "5", "1", "2", "4", "5"),
  Landing_kg = c(10.5, 20.5, 5.6, 400, 2, 120, 250, 10.3, 2.1)
)
# Attach Discard from Data_cam to every Data_sif row of the same vessel AND
# species whose Start_haul lies within 30 minutes of the Data_cam Start_haul.
# Species is part of the join key so that a Discard value recorded for one
# species can never be attached to landing rows of another species.
Data_sif %>%
  left_join(Data_cam, by = c("VesselID", "Species"),
            suffix = c("_sif", "_cam")) %>%
  # Parse each timestamp once instead of re-parsing it in every comparison.
  mutate(time_sif = dmy_hms(Start_haul_sif),
         time_cam = dmy_hms(Start_haul_cam)) %>%
  # Keep the pairs inside the +/- 30 minute window (bounds inclusive). Rows
  # without any Data_cam partner have an NA time_cam and are dropped here.
  filter(time_sif >= time_cam - minutes(30),
         time_sif <= time_cam + minutes(30)) %>%
  # Remove the helper columns and the Data_cam timestamp; Discard stays.
  select(-time_sif, -time_cam, -Start_haul_cam)
# Start_haul_sif VesselID Species Size_class Landing_kg Discard
# 1 31-10-2015 07:05:00 XBBX TOR 1 10.5 0.28
# 2 31-10-2015 07:05:00 XBBX TOR 2 20.5 0.28
# 3 31-10-2015 07:05:00 XBBX TOR 3 5.6 0.28
# 4 31-10-2015 23:05:00 XBBX TOR 4 400.0 0.96
# 5 31-10-2015 23:05:00 XBBX TOR 5 2.0 0.96
# 6 01-11-2015 06:28:00 XAAX TOR 1 120.0 2.92
# 7 01-11-2015 06:28:00 XAAX TOR 2 250.0 2.92
# 8 01-11-2015 06:28:00 XAAX TOR 4 10.3 2.92
# 9 01-11-2015 16:11:00 XAAX TOR 5 2.1 0.00
**编辑**
或者稍微精简一下:
# Slim variant: pair Data_sif rows with the Data_cam rows of the same vessel
# AND species, then keep the pairs whose Start_haul values are at most
# 30 minutes (30 * 60 seconds) apart. Joining on Species as well keeps a
# Discard value from being attached to landing rows of a different species.
Data_sif %>%
  left_join(Data_cam, by = c("VesselID", "Species"),
            suffix = c("_sif", "_cam")) %>%
  filter(
    abs(as.numeric(difftime(dmy_hms(Start_haul_sif), dmy_hms(Start_haul_cam),
                            units = "secs"))) <= 30 * 60
  ) %>%
  # Drops Start_haul_cam; Discard has no suffix and is kept.
  select(-contains("_cam"))
答案 1 :(得分:1)
也可以使用 sqldf 来实现一个解决方案。
library(sqldf)
# Turn the day-month-year strings of both tables into POSIXct date-times.
Data_cam[["Start_haul"]] <- as.POSIXct(
  Data_cam[["Start_haul"]],
  format = "%d-%m-%Y %H:%M:%S"
)
Data_sif[["Start_haul"]] <- as.POSIXct(
  Data_sif[["Start_haul"]],
  format = "%d-%m-%Y %H:%M:%S"
)
# Rows are joined when VesselID and Species agree and the absolute
# difference between the two Start_haul values is at most 30*60, i.e.
# 1800 seconds.
sqldf("SELECT Data_sif.Start_haul, Data_sif.VesselID, Data_sif.Species,
Data_sif.Size_class, Data_sif.Landing_kg, Data_cam.Discard
FROM Data_sif, Data_cam
WHERE Data_sif.VesselID = Data_cam.VesselID AND
Data_sif.Species = Data_cam.Species AND
abs(Data_sif.Start_haul - Data_cam.Start_haul) <= 30*60
")
# Result
# Start_haul VesselID Species Size_class Landing_kg Discard
#1 31-10-2015 07:05:00 XBBX TOR 1 10.5 0.28
#2 31-10-2015 07:05:00 XBBX TOR 2 20.5 0.28
#3 31-10-2015 07:05:00 XBBX TOR 3 5.6 0.28
#4 31-10-2015 23:05:00 XBBX TOR 4 400.0 0.96
#5 31-10-2015 23:05:00 XBBX TOR 5 2.0 0.96
#6 01-11-2015 06:28:00 XAAX TOR 1 120.0 2.92
#7 01-11-2015 06:28:00 XAAX TOR 2 250.0 2.92
#8 01-11-2015 06:28:00 XAAX TOR 4 10.3 2.92
#9 01-11-2015 16:11:00 XAAX TOR 5 2.1 0.00
**数据**
# Data_cam: one row per haul with its Discard value (two hauls per vessel).
Data_cam <- data.frame(
  Start_haul = c(
    "31-10-2015 07:13:00",
    "31-10-2015 22:40:00",
    "01-11-2015 06:48:00",
    "01-11-2015 16:13:00"
  ),
  VesselID = c("XBBX", "XAAX")[c(1, 1, 2, 2)],
  Species = "TOR",
  Discard = c(0.28, 0.96, 2.92, 0)
)
# Data_sif: one row per haul and size class. The index vectors spell out
# which of the four haul times / two vessels each of the nine rows uses.
Data_sif <- data.frame(
  Start_haul = c(
    "31-10-2015 07:05:00",
    "31-10-2015 23:05:00",
    "01-11-2015 06:28:00",
    "01-11-2015 16:11:00"
  )[c(1, 1, 1, 2, 2, 3, 3, 3, 4)],
  VesselID = c("XBBX", "XAAX")[c(1, 1, 1, 1, 1, 2, 2, 2, 2)],
  Species = "TOR",
  Size_class = c("1", "2", "3", "4", "5", "1", "2", "4", "5"),
  Landing_kg = c(10.5, 20.5, 5.6, 400, 2, 120, 250, 10.3, 2.1)
)
答案 2 :(得分:0)
您可以考虑使用 data.table 中的非等值连接(non-equi join),如下所示:
library(data.table)
# Convert both data frames to data.tables by reference.
setDT(Data_cam)
setDT(Data_sif)
# Parse the timestamps as POSIXct and give every Data_cam row a window that
# reaches 30 minutes (30 * 60 seconds) before and after its Start_haul.
Data_cam[, Start_haul := as.POSIXct(Start_haul, format = "%d-%m-%Y %H:%M:%S")]
Data_cam[, c("BufferStart", "BufferEnd") :=
           .(Start_haul - 30 * 60, Start_haul + 30 * 60)]
Data_sif[, Start_haul := as.POSIXct(Start_haul, format = "%d-%m-%Y %H:%M:%S")]
# Non-equi update join: for each Data_cam row, write its Discard into the
# Data_sif rows of the same vessel and species whose Start_haul falls inside
# [BufferStart, BufferEnd]. The lower bound has to be BufferStart; comparing
# against the Data_cam Start_haul itself would skip every Data_sif row that
# was recorded before the Data_cam time. Data_sif rows without a match keep
# NA in Discard. The i. prefix states explicitly that the value comes from
# Data_cam.
Data_sif[Data_cam, Discard := i.Discard,
         on = .(VesselID, Species,
                Start_haul >= BufferStart, Start_haul <= BufferEnd)]