比较数据框以提取特定值

时间:2018-08-04 16:09:22

标签: r date merge posixct which

我有两个数据框:

# Lookup table: one target timestamp (as a string) for each group A-F.
df <- data.frame(
  Group = LETTERS[1:6],
  Date = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:03:00",
    "2018-04-14 14:30:00",
    "2018-04-15 03:10:00",
    "2018-04-16 07:28:00",
    "2018-04-17 11:17:00"
  )
)

# Candidate table: several timestamps (as strings) per group; group C has
# three candidates, every other group has two.
df2 <- data.frame(
  Group = rep(LETTERS[1:6], times = c(2L, 2L, 3L, 2L, 2L, 2L)),
  Date = c(
    "2018-04-12 08:56:00", "2018-04-12 10:42:00",
    "2018-04-13 10:03:00", "2018-04-13 11:21:00",
    "2018-04-14 08:17:00", "2018-04-14 10:32:00", "2018-04-14 22:44:00",
    "2018-04-15 03:10:00", "2018-04-15 11:17:00",
    "2018-04-16 16:56:00", "2018-04-16 20:01:00",
    "2018-04-17 11:15:00", "2018-04-17 11:20:00"
  )
)

我想做两件事。首先,按组将df中的Date列与df2中的Date列进行比较:如果存在完全匹配,则提取该匹配的Date;如果没有完全匹配,则从df2中提取早于df中的Date且与之最接近的Date。

第二,按组将df中的Date列与df2中的Date列进行比较:如果存在完全匹配,则提取该Date;如果没有完全匹配,则从df2中提取最接近的Date,不管它是在df中的Date之前还是之后。

因此,此示例的结果应类似于以下内容:

# Expected output for the example data.
#   Return1: exact match, otherwise the closest earlier date in the same group
#            (NA when the group has no earlier date).
#   Return2: exact match, otherwise the closest date in either direction.
result <- data.frame(
  Group = LETTERS[1:6],
  Date = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:03:00",
    "2018-04-14 14:30:00",
    "2018-04-15 03:10:00",
    "2018-04-16 07:28:00",
    "2018-04-17 11:17:00"
  ),
  Return1 = c(
    "2018-04-12 08:56:00",
    "2018-04-13 10:03:00",
    "2018-04-14 10:32:00",
    "2018-04-15 03:10:00",
    NA,
    "2018-04-17 11:15:00"
  ),
  Return2 = c(
    "2018-04-12 08:56:00",
    "2018-04-13 11:21:00",
    "2018-04-14 10:32:00",
    "2018-04-15 03:10:00",
    "2018-04-16 16:56:00",
    "2018-04-17 11:15:00"
  )
)

2 个答案:

答案 0 :(得分:1)

这就是我想找的东西。

library(dplyr)
library(purrr)
library(lubridate)
library(data.table)

# Parse the timestamp strings. In df2 the parsed value is also copied into
# Result1, because the rolling join below overwrites df2's Date key with the
# Date from df; the copy keeps the matched df2 timestamp visible.
df <- mutate(df, Date = parse_date_time(Date, orders = "ymd HMS"))
df2 <- mutate(
  df2,
  Date = parse_date_time(Date, orders = "ymd HMS"),
  Result1 = Date
)
# Same candidates again, with the copy named Result2 for the "nearest" join.
df3 <- rename(df2, Result2 = Result1)

# Turn all three tables into data.tables keyed on (Group, Date); the last key
# column is the one the rolling join rolls on.
setDT(df, key = c("Group", "Date"))
setDT(df2, key = c("Group", "Date"))
setDT(df3, key = c("Group", "Date"))

# roll = Inf       -> exact match, else the last earlier df2 row (Result1)
# roll = "nearest" -> exact match, else the closest df2 row (Result2)
full_join(
  df2[df, roll = Inf],
  df3[df, roll = "nearest"],
  by = c("Group", "Date")
)

#   Group                Date             Result1             Result2
# 1     A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
# 2     B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
# 3     C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
# 4     D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
# 5     E 2018-04-16 07:28:00                <NA> 2018-04-16 16:56:00
# 6     F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00

答案 1 :(得分:1)

这是仅使用 base R 的一种可能解决方案:

# Base R solution built on a full (df rows x df2 rows) difference matrix.
# Reads `df` and `df2` (columns Group and Date) and creates `res`, a copy of
# `df` with two extra columns:
#   Return1: exact match, else the closest earlier df2 date of the same group
#   Return2: exact match, else the closest df2 date of the same group
# Both are NA when the group offers no suitable candidate.

# Convert the dates to POSIXct. The time zone is fixed so that parsing does
# not depend on the session's local zone (and its daylight-saving gaps).
d1 <- as.POSIXct(df$Date, tz = "UTC")
d2 <- as.POSIXct(df2$Date, tz = "UTC")

# m[i, j] = df$Date[i] - df2$Date[j] in seconds, or NA when the two rows
# belong to different groups. Groups are compared as character so that
# factor columns with different level sets do not raise an error.
m <- outer(
  seq_len(nrow(df)), seq_len(nrow(df2)),
  function(i, j) {
    secs <- as.numeric(difftime(d1[i], d2[j], units = "secs"))
    same_group <- as.character(df$Group[i]) == as.character(df2$Group[j])
    ifelse(same_group, secs, NA_real_)
  }
)

# Return the column of `r` (one row of m) holding the smallest absolute
# difference, or NA when no candidate exists. With allow_later = FALSE,
# candidates later than the df date (negative differences) are excluded.
# Ties resolve to the first candidate in df2 row order.
pick_df2_row <- function(r, allow_later) {
  if (!allow_later) {
    r[which(r < 0)] <- NA
  }
  best <- which.min(abs(r)) # ignores NA; integer(0) when every entry is NA
  if (length(best) == 0L) NA_integer_ else best
}

# copy df into res
res <- df
rows <- seq_len(nrow(m))

# Index df2$Date once with the chosen row numbers so the result keeps the
# class of the Date column (character, factor or POSIXct).
res$Return1 <- df2$Date[
  vapply(rows, function(i) pick_df2_row(m[i, ], FALSE), integer(1))
]
res$Return2 <- df2$Date[
  vapply(rows, function(i) pick_df2_row(m[i, ], TRUE), integer(1))
]

> res
  Group                Date             Return1             Return2
1     A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
2     B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
3     C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
4     D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
5     E 2018-04-16 07:28:00                <NA> 2018-04-16 16:56:00
6     F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00

这是另一种可能的解决方案(仍然只使用 base R),可能更高效:

# Alternative base R solution that only compares rows within the same Group,
# so the full (df rows x df2 rows) difference matrix is never built.
# Reads `df` and `df2` (columns Group and Date) and creates `res`, a copy of
# `df` with two extra columns:
#   Return1: exact match, else the closest earlier df2 date of the same group
#   Return2: exact match, else the closest df2 date of the same group
# Both are NA when the group offers no suitable candidate.

# Convert the dates to POSIXct. The time zone is fixed so that parsing does
# not depend on the session's local zone (and its daylight-saving gaps).
d1 <- as.POSIXct(df$Date, tz = "UTC")
d2 <- as.POSIXct(df2$Date, tz = "UTC")

# split the row-indexes of df2 into a list of indexes by Group
df2splits <- split(seq_len(nrow(df2)), df2$Group)
# for each row of df, the position in df2splits of the same Group
# (NA when the group does not occur in df2)
splitIdxs <- match(df$Group, names(df2splits))

# Return the df2 row number whose date is closest to df row `i` within the
# same group, or NA when no candidate exists. With allow_later = FALSE,
# candidates later than the df date are excluded. Ties resolve to the first
# candidate in df2 row order.
closest_df2_row <- function(i, allow_later) {
  k <- splitIdxs[i]
  if (is.na(k)) {
    return(NA_integer_)
  }
  idx <- df2splits[[k]]
  differ <- as.numeric(difftime(d1[i], d2[idx], units = "secs"))
  if (!allow_later) {
    differ[which(differ < 0)] <- NA
  }
  best <- which.min(abs(differ)) # ignores NA; integer(0) if nothing is left
  if (length(best) == 0L) NA_integer_ else idx[best]
}

# Start from a fresh copy of df so this snippet also runs on its own.
res <- df
rows <- seq_len(nrow(df))

# Index df2$Date once with the chosen row numbers so the result keeps the
# class of the Date column (character, factor or POSIXct).
res$Return1 <- df2$Date[
  vapply(rows, closest_df2_row, integer(1), allow_later = FALSE)
]
res$Return2 <- df2$Date[
  vapply(rows, closest_df2_row, integer(1), allow_later = TRUE)
]

> res
  Group                Date             Return1             Return2
1     A 2018-04-12 08:56:00 2018-04-12 08:56:00 2018-04-12 08:56:00
2     B 2018-04-13 11:03:00 2018-04-13 10:03:00 2018-04-13 11:21:00
3     C 2018-04-14 14:30:00 2018-04-14 10:32:00 2018-04-14 10:32:00
4     D 2018-04-15 03:10:00 2018-04-15 03:10:00 2018-04-15 03:10:00
5     E 2018-04-16 07:28:00                <NA> 2018-04-16 16:56:00
6     F 2018-04-17 11:17:00 2018-04-17 11:15:00 2018-04-17 11:15:00