我有两个数据框:
# Reference table: one timestamp (as text) for each of the groups A-F.
df <- data.frame(
  Group = LETTERS[1:6],
  Date = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:03:00",
    "2018-04-14 14:30:00",
    "2018-04-15 03:10:00",
    "2018-04-16 07:28:00",
    "2018-04-17 11:17:00"
  )
)
# Candidate table: two timestamps per group, except group C which has three.
df2 <- data.frame(
  Group = rep(LETTERS[1:6], times = c(2, 2, 3, 2, 2, 2)),
  Date = c(
    "2018-04-12 08:56:00", "2018-04-12 10:42:00",
    "2018-04-13 10:03:00", "2018-04-13 11:21:00",
    "2018-04-14 08:17:00", "2018-04-14 10:32:00", "2018-04-14 22:44:00",
    "2018-04-15 03:10:00", "2018-04-15 11:17:00",
    "2018-04-16 16:56:00", "2018-04-16 20:01:00",
    "2018-04-17 11:15:00", "2018-04-17 11:20:00"
  )
)
我想做两件事。首先,按组,我想将df中的Date列与df2中的Date列进行比较,并提取完全匹配的Date;如果没有完全匹配,则从df2中提取最接近且早于df中Date的那个Date。
第二,按组,我想将df中的Date列与df2中的Date列进行比较,并在存在完全匹配项的情况下提取该Date;如果没有完全匹配项,则从df2中提取最接近的Date,不管它在df中的Date之前还是之后。
因此,此示例的结果应类似于以下内容:
# Expected output for the sample data: the original Group/Date columns plus
# the two looked-up timestamps. Return1 is missing (NA) for group E.
result <- data.frame(
  Group = LETTERS[1:6],
  Date = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:03:00",
    "2018-04-14 14:30:00",
    "2018-04-15 03:10:00",
    "2018-04-16 07:28:00",
    "2018-04-17 11:17:00"
  ),
  Return1 = c(
    "2018-04-12 08:56:00",
    "2018-04-13 10:03:00",
    "2018-04-14 10:32:00",
    "2018-04-15 03:10:00",
    NA_character_,
    "2018-04-17 11:15:00"
  ),
  Return2 = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:21:00",
    "2018-04-14 10:32:00",
    "2018-04-15 03:10:00",
    "2018-04-16 16:56:00",
    "2018-04-17 11:15:00"
  )
)
答案 0 :(得分:1)
这就是我想找的东西。
library(dplyr)
library(purrr)
library(lubridate)
library(data.table)
# Parse the timestamp strings into date-times. In df2, also keep a copy of each
# timestamp in Result1 so it is still visible after the join replaces Date.
df <- mutate(df, Date = parse_date_time(Date, orders = "ymd HMS"))
df2 <- mutate(
  df2,
  Date = parse_date_time(Date, orders = "ymd HMS"),
  Result1 = Date
)
# Same candidate table, with the copied column named Result2 for the second lookup.
df3 <- rename(df2, Result2 = Result1)

# Convert each table to a data.table in place and key it on (Group, Date);
# the rolling joins below match on Group and roll on Date.
setDT(df)
setkeyv(df, c("Group", "Date"))
setDT(df2)
setkeyv(df2, c("Group", "Date"))
setDT(df3)
setkeyv(df3, c("Group", "Date"))

# roll = Inf: exact match, otherwise the latest earlier df2 row in the group.
# roll = "nearest": exact match, otherwise the closest df2 row on either side.
# The two lookups are then combined into one table on (Group, Date).
full_join(
  df2[df, roll = Inf],
  df3[df, roll = "nearest"],
  by = c("Group", "Date")
)
# Group Date Result1 Result2
# 1 A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
# 2 B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
# 3 C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
# 4 D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
# 5 E 2018-04-16 07:28:00 <NA> 2018-04-16 16:56:00
# 6 F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00
答案 1 :(得分:1)
这是仅使用base R(基础R)的可能解决方案:
# Base R solution built on a full (df rows) x (df2 rows) difference matrix.

# Convert the date strings to POSIXct so they can be subtracted.
d1 <- as.POSIXct(df$Date)
d2 <- as.POSIXct(df2$Date)
# Values to hand back, as text (also covers a Date column stored as a factor).
dates2 <- as.character(df2$Date)

# m[i, j] = df$Date[i] - df2$Date[j] in seconds, or NA where row i of df and
# row j of df2 belong to different groups. The unit is fixed explicitly because
# the default difftime unit depends on the data. seq_len() keeps the index
# vectors empty (rather than c(1, 0)) when a table has no rows.
m <- outer(seq_len(nrow(df)), seq_len(nrow(df2)), function(i, j) {
  secs <- as.numeric(difftime(d1[i], d2[j], units = "secs"))
  same_group <- as.character(df$Group[i]) == as.character(df2$Group[j])
  ifelse(same_group, secs, NA_real_)
})

# Copy df into res; the two lookup columns are appended to the copy.
res <- df

# Return1: per df row, the df2 date with the smallest difference >= 0, i.e. an
# exact match or the closest earlier date. which.min() skips NAs and returns
# integer(0) when nothing qualifies; `[1L]` turns that into NA.
res$Return1 <- vapply(seq_len(nrow(m)), function(k) {
  gap <- m[k, ]
  gap[which(gap < 0)] <- NA
  dates2[which.min(gap)[1L]]
}, character(1))

# Return2: per df row, the df2 date with the smallest absolute difference.
# which.min() is used instead of order(...)[1] because order() still lists NA
# entries (last), so an all-NA row (group absent from df2) would otherwise
# return the first df2 date of an unrelated group instead of NA.
res$Return2 <- vapply(seq_len(nrow(m)), function(k) {
  dates2[which.min(abs(m[k, ]))[1L]]
}, character(1))
> res
Group Date Return1 Return2
1 A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
2 B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
3 C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
4 D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
5 E 2018-04-16 07:28:00 <NA> 2018-04-16 16:56:00
6 F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00
这是另一种可能的解决方案(同样仅使用base R),效率可能更高:
# Base R solution that only compares dates within the same group.

# Convert the date strings to POSIXct so they can be subtracted.
d1 <- as.POSIXct(df$Date)
d2 <- as.POSIXct(df2$Date)
# Values to hand back, as text (also covers a Date column stored as a factor).
dates2 <- as.character(df2$Date)

# Row indexes of df2, split into a list with one element per Group.
df2splits <- split(seq_len(nrow(df2)), df2$Group)
# For each row of df, the position in df2splits of the same Group
# (NA when the group does not occur in df2).
splitIdxs <- match(df$Group, names(df2splits))

# Start from a copy of df so this snippet runs without a pre-existing `res`.
res <- df

# Return1: exact match, otherwise the closest earlier df2 date in the group.
res$Return1 <- vapply(seq_len(nrow(df)), function(i) {
  if (is.na(splitIdxs[i])) {
    return(NA_character_)
  }
  idx <- df2splits[[splitIdxs[i]]]
  secs <- as.numeric(difftime(d1[i], d2[idx], units = "secs"))
  # Discard later df2 dates (negative difference); which() ignores NAs.
  secs[which(secs < 0)] <- NA
  # which.min() returns integer(0) when nothing qualifies; `[1L]` makes it NA.
  dates2[idx][which.min(secs)[1L]]
}, character(1))

# Return2: exact match, otherwise the closest df2 date in the group, whether
# it falls before or after the df date.
res$Return2 <- vapply(seq_len(nrow(df)), function(i) {
  if (is.na(splitIdxs[i])) {
    return(NA_character_)
  }
  idx <- df2splits[[splitIdxs[i]]]
  secs <- as.numeric(difftime(d1[i], d2[idx], units = "secs"))
  # which.min() skips NA differences, so an unparseable date yields NA rather
  # than the first row of the group as order(...)[1] would.
  dates2[idx][which.min(abs(secs))[1L]]
}, character(1))
> res
Group Date Return1 Return2
1 A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
2 B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
3 C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
4 D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
5 E 2018-04-16 07:28:00 <NA> 2018-04-16 16:56:00
6 F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00