给定如下数据框:
# Sample session log: one row per session, with the owning id, the session
# label, start/end timestamps (kept as text), and `diff`, the gap to the
# previous session (-1 marks rows that have no previous session to compare).
df2 <- data.frame(
  id = rep(c("A", "B", "D", "E"), times = c(6, 5, 4, 1)),
  session = paste0("XY", seq_len(16)),
  start = c(
    "2017-10-28 14:39:09", "2017-10-28 14:54:15", "2017-10-28 17:57:38",
    "2017-10-29 6:18:18", "2017-10-29 9:57:33", "2017-10-29 21:35:36",
    "2017-10-29 5:26:57", "2017-10-29 5:33:44", "2017-10-29 15:37:25",
    "2017-10-29 18:21:13", "2017-10-29 18:26:33", "2017-10-29 5:41:00",
    "2017-10-29 16:52:54", "2017-10-29 16:56:52", "2017-10-29 4:10:31",
    "2017-10-28 2:45:49"
  ),
  end = c(
    "2017-10-28 14:39:10", "2017-10-28 16:16:02", "2017-10-28 18:01:57",
    "2017-10-29 6:18:20", "2017-10-29 10:05:13", "2017-10-29 21:36:37",
    "2017-10-29 5:30:43", "2017-10-29 5:33:44", "2017-10-29 15:37:29",
    "2017-10-29 18:23:15", "2017-10-29 18:26:33", "2017-10-29 5:45:17",
    "2017-10-29 16:52:55", "2017-10-29 16:57:09", "2017-10-29 4:52:01",
    "2017-10-29 3:54:39"
  ),
  diff = c(
    -1, 905, 6096, 44181, 13153, 41423,
    -1, 181, 36221, 9824, 198,
    -1, 38, 237, -1,
    -1
  )
)
列 `diff` 是上一个会话结束与当前会话开始之间的时间差(以秒为单位);如果 `id` 发生变化,则值为 -1。
如果 `diff` 小于 1800 秒(即 30 分钟),我们的目标是合并会话,因此期望的输出是:
# Expected result: the input columns plus `new_session`, where each row that
# merges into the previous session carries the label of the session that
# started the merged run.
data.frame(
  id = rep(c("A", "B", "D", "E"), times = c(6, 5, 4, 1)),
  session = paste0("XY", seq_len(16)),
  start = c(
    "2017-10-28 14:39:09", "2017-10-28 14:54:15", "2017-10-28 17:57:38",
    "2017-10-29 6:18:18", "2017-10-29 9:57:33", "2017-10-29 21:35:36",
    "2017-10-29 5:26:57", "2017-10-29 5:33:44", "2017-10-29 15:37:25",
    "2017-10-29 18:21:13", "2017-10-29 18:26:33", "2017-10-29 5:41:00",
    "2017-10-29 16:52:54", "2017-10-29 16:56:52", "2017-10-29 4:10:31",
    "2017-10-28 2:45:49"
  ),
  end = c(
    "2017-10-28 14:39:10", "2017-10-28 16:16:02", "2017-10-28 18:01:57",
    "2017-10-29 6:18:20", "2017-10-29 10:05:13", "2017-10-29 21:36:37",
    "2017-10-29 5:30:43", "2017-10-29 5:33:44", "2017-10-29 15:37:29",
    "2017-10-29 18:23:15", "2017-10-29 18:26:33", "2017-10-29 5:45:17",
    "2017-10-29 16:52:55", "2017-10-29 16:57:09", "2017-10-29 4:52:01",
    "2017-10-29 3:54:39"
  ),
  diff = c(
    -1, 905, 6096, 44181, 13153, 41423,
    -1, 181, 36221, 9824, 198,
    -1, 38, 237, -1,
    -1
  ),
  new_session = c(
    "XY1", "XY1", "XY3", "XY4", "XY5", "XY6",
    "XY7", "XY7", "XY9", "XY10", "XY10",
    "XY12", "XY12", "XY12", "XY15",
    "XY16"
  )
)
我尝试了使用循环,它可以得到正确结果,但是耗时很长:
# Row-by-row merge of sessions.
# Every row starts out labelled with its own session (as character, so a
# factor column does not leak integer codes). A row whose gap to the previous
# session (`diff`) lies in 0..1800 then inherits the label already assigned to
# the row above it, which lets a chain of short gaps collapse onto the first
# session of the chain.
merged <- as.character(df2$session)
# Rows with a missing gap are treated as not joining the previous session.
joins_previous <- !is.na(df2$diff) & df2$diff >= 0 & df2$diff <= 1800
# The first row has no predecessor, so it can never inherit a label; which()
# also yields an empty sequence for an empty data frame, unlike 1:nrow(df2).
for (i in setdiff(which(joins_previous), 1L)) {
  merged[i] <- merged[i - 1L]
}
df2$new_session <- merged
我也尝试了使用 dplyr,但没有得到正确的结果。如果有更快的解决方案,将非常有帮助:
# Label every row with the first session of its merged run.
# A row opens a new run unless its gap to the previous session (`diff`) lies
# in 0..1800; cumsum() over the run openings numbers the runs, and
# match(run, run) returns the index of each run's first row.
# NOTE: comparing against lag(session) alone only looks one row back, so a
# chain of three or more merged sessions would receive the wrong label; the
# as.character() call also keeps a factor column from turning into codes.
df2 <- df2 %>%
  mutate(
    session_run = cumsum(!(diff >= 0 & diff <= 1800)),
    n_session = as.character(session)[match(session_run, session_run)],
    session_run = NULL
  )
答案 0 :(得分:0)
Creating user sessions with fast computation
# library() stops with an error if data.table is missing; require() would
# only warn and let the script fail later on setDT().
library(data.table)

# Convert df2 to a data.table so the `:=` assignments below add and remove
# columns on df2 itself.
setDT(df2)
# TRUE when the row continues the previous session, i.e. its gap (`diff`)
# lies in 0..1800.
df2[, continues_prev := diff >= 0 & diff <= 1800]
# Each row that does not continue the previous one opens a new run; the
# running count of such rows serves as the run id.
df2[, run_id := cumsum(!continues_prev)]
# All rows of a run take the label of the run's first session.
df2[, new_session := first(session), by = run_id]
# Drop the helper columns so only new_session is added to the result.
df2[, c("continues_prev", "run_id") := NULL]
df2
# id session start end diff new_session
# 1: A XY1 2017-10-28 14:39:09 2017-10-28 14:39:10 -1 XY1
# 2: A XY2 2017-10-28 14:54:15 2017-10-28 16:16:02 905 XY1
# 3: A XY3 2017-10-28 17:57:38 2017-10-28 18:01:57 6096 XY3
# 4: A XY4 2017-10-29 6:18:18 2017-10-29 6:18:20 44181 XY4
# 5: A XY5 2017-10-29 9:57:33 2017-10-29 10:05:13 13153 XY5
# 6: A XY6 2017-10-29 21:35:36 2017-10-29 21:36:37 41423 XY6
# 7: B XY7 2017-10-29 5:26:57 2017-10-29 5:30:43 -1 XY7
# 8: B XY8 2017-10-29 5:33:44 2017-10-29 5:33:44 181 XY7
# 9: B XY9 2017-10-29 15:37:25 2017-10-29 15:37:29 36221 XY9
# 10: B XY10 2017-10-29 18:21:13 2017-10-29 18:23:15 9824 XY10
# 11: B XY11 2017-10-29 18:26:33 2017-10-29 18:26:33 198 XY10
# 12: D XY12 2017-10-29 5:41:00 2017-10-29 5:45:17 -1 XY12
# 13: D XY13 2017-10-29 16:52:54 2017-10-29 16:52:55 38 XY12
# 14: D XY14 2017-10-29 16:56:52 2017-10-29 16:57:09 237 XY12
# 15: D XY15 2017-10-29 4:10:31 2017-10-29 4:52:01 -1 XY15
# 16: E XY16 2017-10-28 2:45:49 2017-10-29 3:54:39 -1 XY16
答案 1 :(得分:0)
使用 dplyr 和 zoo:
library(dplyr)
library(zoo)
# Blank out the label of every row that continues the previous session (gap
# `diff` within 0..1800), then fill each blank with the last label seen above
# it, so a whole run of merged rows ends up with the run's first session.
# na.rm = FALSE keeps a leading NA in place: with the default (TRUE) na.locf()
# would drop it and return a vector shorter than the data frame.
df <- df2 %>%
  mutate(
    new_session = ifelse(
      diff <= 1800 & diff >= 0,
      NA_character_,
      as.character(session)
    ),
    new_session = na.locf(new_session, na.rm = FALSE)
  )
df
输出是:
id session start end diff new_session
1 A XY1 2017-10-28 14:39:09 2017-10-28 14:39:10 -1 XY1
2 A XY2 2017-10-28 14:54:15 2017-10-28 16:16:02 905 XY1
3 A XY3 2017-10-28 17:57:38 2017-10-28 18:01:57 6096 XY3
4 A XY4 2017-10-29 6:18:18 2017-10-29 6:18:20 44181 XY4
5 A XY5 2017-10-29 9:57:33 2017-10-29 10:05:13 13153 XY5
6 A XY6 2017-10-29 21:35:36 2017-10-29 21:36:37 41423 XY6
7 B XY7 2017-10-29 5:26:57 2017-10-29 5:30:43 -1 XY7
8 B XY8 2017-10-29 5:33:44 2017-10-29 5:33:44 181 XY7
9 B XY9 2017-10-29 15:37:25 2017-10-29 15:37:29 36221 XY9
10 B XY10 2017-10-29 18:21:13 2017-10-29 18:23:15 9824 XY10
11 B XY11 2017-10-29 18:26:33 2017-10-29 18:26:33 198 XY10
12 D XY12 2017-10-29 5:41:00 2017-10-29 5:45:17 -1 XY12
13 D XY13 2017-10-29 16:52:54 2017-10-29 16:52:55 38 XY12
14 D XY14 2017-10-29 16:56:52 2017-10-29 16:57:09 237 XY12
15 D XY15 2017-10-29 4:10:31 2017-10-29 4:52:01 -1 XY15
16 E XY16 2017-10-28 2:45:49 2017-10-29 3:54:39 -1 XY16