我陷入了这个问题,想不出任何简单的dplyr
来解决这个问题:
我有两个data.frames df1
和df2
。我想将time2
列的值从df2
“绑定”到df1
,但前提是user_id
和place_id
匹配:< / p>
> head(df1)
time1 user_id place_id
1 2018-06-09 12:56:12 sdkID1 place_ID1
2 2018-06-24 05:15:07 sdkID1 place_ID1
3 2018-06-12 04:15:21 sdkID1 place_ID10
4 2018-06-12 14:56:42 sdkID1 place_ID17
5 2018-05-16 18:21:51 sdkID1 place_ID20
6 2018-07-11 12:19:27 sdkID1 place_ID21
> head(df2)
time2 user_id place_id
1 2018-06-09 13:12:39 sdkID1 place_ID1
2 2018-06-24 06:52:51 sdkID1 place_ID1
3 2018-06-12 05:50:19 sdkID1 place_ID10
4 2018-05-16 19:42:59 sdkID1 place_ID20
5 2018-07-11 12:23:44 sdkID1 place_ID21
6 2018-06-13 11:56:05 sdkID1 place_ID34
但是,我没有任何id
变量来检查time2
中的df2
是否属于df1
。为了使事情变得有趣,对于某些事件,我没有任何时间戳可以匹配到df1
。
我想要类似的东西:
> head(result)
time1 user_id place_id time2
1 2018-06-09 12:56:12 sdkID1 place_ID1 2018-06-09 13:12:39
2 2018-06-24 05:15:07 sdkID1 place_ID1 2018-06-24 06:52:51
3 2018-06-12 04:15:21 sdkID1 place_ID10 2018-06-12 05:50:19
4 2018-06-12 14:56:42 sdkID1 place_ID17 NA
5 2018-05-16 18:21:51 sdkID1 place_ID20 2018-05-16 19:42:59
6 2018-07-11 12:19:27 sdkID1 place_ID21 2018-07-11 12:23:44
有没有办法采取
time2-time1
只保留行与 正时差?我知道有但是后来我有机会 像前两行一样,它们具有相同的user_id
和place_id
这样我得到2018-06-24 06:52:51
-2018-06-24 05:15:07
的结果 和2018-06-24 06:52:51
-2018-06-09 12:56:12
。我只需要第一个区别。想象time1是到达,time2是离开。基本上,我的问题归结为找出正在运行的火车或飞机。我需要某种方式来了解
2018-06-24 06:52:51
-2018-06-24 05:15:07
是相同的,并且2018-06-24 06:52:51
-2018-06-09 12:56:12
不是同一火车/飞机。
由于我想将代码转换为SQL
,因此解决方案必须基于dplyr
。我尝试过类似df1 %>% group_by(user_id,place_id)
的操作,但现在肯定会卡住。这是一些示例数据
set.seed(42)
u <- runif(1000, 0, 60) # "noise" to add or subtract from some timepoint
df1<-data.frame(time1=as.POSIXlt(sort(u)*100000, origin = "2018-05-03 08:00:00"),
user_id=sample(rep(paste0('sdkID',1:60)),1000,replace=TRUE),
place_id=sample(rep(paste0('place_ID',1:60)),1000,replace=TRUE))
df1=df1[order(df1$user_id,df1$place_id,df1$time1),]
df2=df1[-sample(1:1000,200),]
df2$time1<-df2$time1+u[-sample(1:1000,200)]*100
## cleaning up
colnames(df2)[1]='time2'
rownames(df1)=1:1000
rownames(df2)=1:800
答案 0 :(得分:1)
建议使用lubridate:在R中使用日期和时间来计算最小时差。
library(dplyr)
library(lubridate)
# Codes Given
set.seed(42)
u <- runif(1000, 0, 60) # "noise" to add or subtract from some timepoint
df1<-data.frame(time1=as.POSIXlt(sort(u)*100000, origin = "2018-05-03 08:00:00"),
user_id=sample(rep(paste0('sdkID',1:60)),1000,replace=TRUE),
place_id=sample(rep(paste0('place_ID',1:60)),1000,replace=TRUE))
df1=df1[order(df1$user_id,df1$place_id,df1$time1),]
df2=df1[-sample(1:1000,200),]
df2$time1<-df2$time1+u[-sample(1:1000,200)]*100
# dplyr operations
df_3 = df1 %>% left_join(df2, by = c('user_id', 'place_id'))
df_3$time_diff = abs(ymd_hms(df_3$time1.x) - ymd_hms(df_3$time1.y))
df_3 %>%
arrange(-desc(user_id), -desc(place_id), -desc(time_diff)) %>%
group_by(user_id, place_id) %>%
slice(which.min(time_diff))
其他资源:
答案 1 :(得分:1)
我相信以下内容可以解决您的问题。
library(dplyr)
result <- df1 %>%
left_join(df2, by = c("user_id", "place_id")) %>%
mutate(Diff = difftime(time1.y, time1.x, units = "secs"),
Diff = as.numeric(Diff)) %>%
filter(Diff > 0) %>%
arrange(user_id, place_id, time1.x) %>%
group_by(time1.x) %>%
mutate(time1 = first(time1.x), time2 = time1.y) %>%
ungroup() %>%
select(-Diff, -time1.x, -time1.y)
head(result)
## A tibble: 6 x 4
# user_id place_id time1 time2
# <fct> <fct> <dttm> <dttm>
#1 sdkID1 place_ID1 2018-05-14 06:53:01 2018-05-14 08:24:30
#2 sdkID1 place_ID18 2018-06-05 04:38:53 2018-06-05 06:12:35
#3 sdkID1 place_ID19 2018-05-22 19:20:40 2018-05-22 19:49:17
#4 sdkID1 place_ID25 2018-06-15 08:55:55 2018-06-15 10:18:58
#5 sdkID1 place_ID27 2018-05-06 17:34:40 2018-05-15 17:17:48
#6 sdkID1 place_ID27 2018-05-06 17:34:40 2018-06-11 15:14:07
答案 2 :(得分:1)
基于@RuiBarradas和@kon_u的答案,我设法解决了我的问题。由于两者都仅部分理解了该问题(部分原因是由于我对问题描述的阐述不够清楚),因此我在这里分享了完整的解决方案:
result<-df1 %>%
left_join(df2, by = c("user_id", "place_id")) %>%
mutate(Diff = difftime(time2, time1, units = "secs"),
Diff = as.numeric(Diff)) %>%
filter(Diff > 0) %>%
arrange(user_id, place_id, time1,time2) %>%
group_by(user_id, place_id,time2) %>%
filter(Diff==min(Diff)) %>%
right_join(df1,by=c("user_id", "place_id","time1"))