如何在R中利用同一列上一行(已依次更新)的结果来更新当前行的值

时间:2017-11-23 22:30:04

标签: r performance for-loop dplyr

在给定的数据框中

# Sample session log: one row per session, ordered within each id.
# `diff` is the gap in seconds between the previous session's end and this
# session's start; -1 marks the first row of a new id.
# stringsAsFactors = FALSE keeps id/session/start/end as character on every
# R version, so ifelse() on `session` yields labels rather than factor codes.
df2 <- data.frame(
  id = c(
    "A", "A", "A", "A", "A", "A",
    "B", "B", "B", "B", "B",
    "D", "D", "D", "D",
    "E"
  ),
  session = c(
    "XY1", "XY2", "XY3", "XY4", "XY5", "XY6",
    "XY7", "XY8", "XY9", "XY10", "XY11",
    "XY12", "XY13", "XY14", "XY15",
    "XY16"
  ),
  start = c(
    "2017-10-28 14:39:09", "2017-10-28 14:54:15", "2017-10-28 17:57:38",
    "2017-10-29 6:18:18", "2017-10-29 9:57:33", "2017-10-29 21:35:36",
    "2017-10-29 5:26:57", "2017-10-29 5:33:44", "2017-10-29 15:37:25",
    "2017-10-29 18:21:13", "2017-10-29 18:26:33",
    "2017-10-29 5:41:00", "2017-10-29 16:52:54", "2017-10-29 16:56:52",
    "2017-10-29 4:10:31",
    "2017-10-28 2:45:49"
  ),
  end = c(
    "2017-10-28 14:39:10", "2017-10-28 16:16:02", "2017-10-28 18:01:57",
    "2017-10-29 6:18:20", "2017-10-29 10:05:13", "2017-10-29 21:36:37",
    "2017-10-29 5:30:43", "2017-10-29 5:33:44", "2017-10-29 15:37:29",
    "2017-10-29 18:23:15", "2017-10-29 18:26:33",
    "2017-10-29 5:45:17", "2017-10-29 16:52:55", "2017-10-29 16:57:09",
    "2017-10-29 4:52:01",
    "2017-10-29 3:54:39"
  ),
  diff = c(
    -1, 905, 6096, 44181, 13153, 41423,
    -1, 181, 36221, 9824, 198,
    -1, 38, 237, -1,
    -1
  ),
  stringsAsFactors = FALSE
)

diff是上一个会话的结束时间与当前会话的开始时间之间的间隔(单位为秒);如果id发生变化,则值为-1。

如果diff在0到1800秒(即30分钟)之间,我们希望将该会话与上一个会话合并,因此期望的输出是:

# Expected result: the sample data plus `new_session`, where every row whose
# `diff` lies in [0, 1800] carries the label of the session it is merged into
# (e.g. XY13 and XY14 both collapse onto XY12).
data.frame(
  id = c(
    "A", "A", "A", "A", "A", "A",
    "B", "B", "B", "B", "B",
    "D", "D", "D", "D",
    "E"
  ),
  session = c(
    "XY1", "XY2", "XY3", "XY4", "XY5", "XY6",
    "XY7", "XY8", "XY9", "XY10", "XY11",
    "XY12", "XY13", "XY14", "XY15",
    "XY16"
  ),
  start = c(
    "2017-10-28 14:39:09", "2017-10-28 14:54:15", "2017-10-28 17:57:38",
    "2017-10-29 6:18:18", "2017-10-29 9:57:33", "2017-10-29 21:35:36",
    "2017-10-29 5:26:57", "2017-10-29 5:33:44", "2017-10-29 15:37:25",
    "2017-10-29 18:21:13", "2017-10-29 18:26:33",
    "2017-10-29 5:41:00", "2017-10-29 16:52:54", "2017-10-29 16:56:52",
    "2017-10-29 4:10:31",
    "2017-10-28 2:45:49"
  ),
  end = c(
    "2017-10-28 14:39:10", "2017-10-28 16:16:02", "2017-10-28 18:01:57",
    "2017-10-29 6:18:20", "2017-10-29 10:05:13", "2017-10-29 21:36:37",
    "2017-10-29 5:30:43", "2017-10-29 5:33:44", "2017-10-29 15:37:29",
    "2017-10-29 18:23:15", "2017-10-29 18:26:33",
    "2017-10-29 5:45:17", "2017-10-29 16:52:55", "2017-10-29 16:57:09",
    "2017-10-29 4:52:01",
    "2017-10-29 3:54:39"
  ),
  diff = c(
    -1, 905, 6096, 44181, 13153, 41423,
    -1, 181, 36221, 9824, 198,
    -1, 38, 237, -1,
    -1
  ),
  new_session = c(
    "XY1", "XY1", "XY3", "XY4", "XY5", "XY6",
    "XY7", "XY7", "XY9", "XY10", "XY10",
    "XY12", "XY12", "XY12", "XY15",
    "XY16"
  )
)

我尝试了for循环,它能得到正确的结果,但耗时很长:

# Sequentially propagate session labels: a row whose gap to the previous
# session is within [0, 1800] inherits the (already updated) label of the row
# above it; every other row keeps its own session label.
#
# The labels are built in a plain character vector and written to the data
# frame once, instead of reassigning a data-frame column on every iteration.
session_labels <- as.character(df2$session)  # labels even if session is a factor
# A missing diff is treated as "do not merge".
continues_prev <- !is.na(df2$diff) & df2$diff >= 0 & df2$diff <= 1800

# Start at row 2: the first row has no predecessor to inherit from.
# seq_len() keeps the loop empty for data frames with zero or one row.
for (i in seq_len(nrow(df2))[-1L]) {
  if (continues_prev[i]) {
    session_labels[i] <- session_labels[i - 1L]
  }
}
df2$new_session <- session_labels

我也尝试过使用dplyr,但结果不正确;任何更快的解决方案都会非常有帮助:

# Vectorised attempt: a row whose gap lies in [0, 1800] takes the label of the
# row directly above it. lag(session) reads the original `session` column, not
# the labels being computed, so a run of several consecutive short gaps is not
# collapsed onto the first session of that run.
df2 <- mutate(
  df2,
  n_session = ifelse(diff >= 0 & diff <= 1800, lag(session), session)
)

2 个答案:

答案 0 :(得分:0)

Creating user sessions with fast computation

# library() fails loudly if data.table is missing; require() would only
# return FALSE and let the following lines fail with a less clear error.
library(data.table)
setDT(df2)  # convert df2 to a data.table by reference

# A row starts a new merged session unless its gap is within [0, 1800].
# The running count of such starts gives every merged session its own index.
df2[, session_ind := cumsum(!(diff >= 0 & diff <= 1800))]
# Every row in a merged session takes the label of that session's first row.
df2[, new_session := session[1L], by = session_ind]
df2[, session_ind := NULL]  # drop the helper column
df2
#     id session               start                 end  diff new_session
#  1:  A     XY1 2017-10-28 14:39:09 2017-10-28 14:39:10    -1         XY1
#  2:  A     XY2 2017-10-28 14:54:15 2017-10-28 16:16:02   905         XY1
#  3:  A     XY3 2017-10-28 17:57:38 2017-10-28 18:01:57  6096         XY3
#  4:  A     XY4  2017-10-29 6:18:18  2017-10-29 6:18:20 44181         XY4
#  5:  A     XY5  2017-10-29 9:57:33 2017-10-29 10:05:13 13153         XY5
#  6:  A     XY6 2017-10-29 21:35:36 2017-10-29 21:36:37 41423         XY6
#  7:  B     XY7  2017-10-29 5:26:57  2017-10-29 5:30:43    -1         XY7
#  8:  B     XY8  2017-10-29 5:33:44  2017-10-29 5:33:44   181         XY7
#  9:  B     XY9 2017-10-29 15:37:25 2017-10-29 15:37:29 36221         XY9
# 10:  B    XY10 2017-10-29 18:21:13 2017-10-29 18:23:15  9824        XY10
# 11:  B    XY11 2017-10-29 18:26:33 2017-10-29 18:26:33   198        XY10
# 12:  D    XY12  2017-10-29 5:41:00  2017-10-29 5:45:17    -1        XY12
# 13:  D    XY13 2017-10-29 16:52:54 2017-10-29 16:52:55    38        XY12
# 14:  D    XY14 2017-10-29 16:56:52 2017-10-29 16:57:09   237        XY12
# 15:  D    XY15  2017-10-29 4:10:31  2017-10-29 4:52:01    -1        XY15
# 16:  E    XY16  2017-10-28 2:45:49  2017-10-29 3:54:39    -1        XY16

答案 1 :(得分:0)

使用dplyr和zoo

library(dplyr)
library(zoo)

# Blank out (NA) the label of every row that continues the previous session,
# then carry the last kept label forward over those blanks.
df <- df2 %>%
  mutate(
    new_session = ifelse(diff <= 1800 & diff >= 0, NA, as.character(session)),
    # na.rm = FALSE keeps the vector the same length as the data frame even if
    # the first row is NA; the default would drop leading NAs and the column
    # assignment would fail.
    new_session = na.locf(new_session, na.rm = FALSE)
  )
df

输出是:

   id session               start                 end  diff new_session
1   A     XY1 2017-10-28 14:39:09 2017-10-28 14:39:10    -1         XY1
2   A     XY2 2017-10-28 14:54:15 2017-10-28 16:16:02   905         XY1
3   A     XY3 2017-10-28 17:57:38 2017-10-28 18:01:57  6096         XY3
4   A     XY4  2017-10-29 6:18:18  2017-10-29 6:18:20 44181         XY4
5   A     XY5  2017-10-29 9:57:33 2017-10-29 10:05:13 13153         XY5
6   A     XY6 2017-10-29 21:35:36 2017-10-29 21:36:37 41423         XY6
7   B     XY7  2017-10-29 5:26:57  2017-10-29 5:30:43    -1         XY7
8   B     XY8  2017-10-29 5:33:44  2017-10-29 5:33:44   181         XY7
9   B     XY9 2017-10-29 15:37:25 2017-10-29 15:37:29 36221         XY9
10  B    XY10 2017-10-29 18:21:13 2017-10-29 18:23:15  9824        XY10
11  B    XY11 2017-10-29 18:26:33 2017-10-29 18:26:33   198        XY10
12  D    XY12  2017-10-29 5:41:00  2017-10-29 5:45:17    -1        XY12
13  D    XY13 2017-10-29 16:52:54 2017-10-29 16:52:55    38        XY12
14  D    XY14 2017-10-29 16:56:52 2017-10-29 16:57:09   237        XY12
15  D    XY15  2017-10-29 4:10:31  2017-10-29 4:52:01    -1        XY15
16  E    XY16  2017-10-28 2:45:49  2017-10-29 3:54:39    -1        XY16