Question

我有一个包含三列的数据框：＆＃34; uuid＆＃34; （即类因子）和＆＃34; created_at＆＃34; （即POSIXct类）和＆＃34; trainer_item_id＆＃34; （因素）我创建了一个名为＆＃34; Sessions＆＃34;的第三列。列Sessions表示按时间排序的每个uuid的时间会话，使得任何连续事件对之间的时间差最多为一小时（3600秒）。

我使用＆＃34; for循环＆＃34;创建了列Sessions和迭代。问题是我有超过一百万的观察结果，创建会话需要8个小时。有没有比我下面的代码更容易，更快捷的创建方法？在此先感谢您的帮助！

以下是原始数据集的示例 - ＆gt; https://gist.github.com/einsiol/5b4e633ce69d3a8e43252f383231e4b8

这是我的代码 - ＆gt;

library(dplyr)
    # Converting the data frame trial to tibble in order to use the function group_by
    trial <- tbl_df(trial); trial <- group_by(trial, uuid)

    # Ordering by timestamp (created_at)
    trial <- arrange(trial, created_at)

    # Creating empty vector of time difference tdiff
    time <- trial$created_at
    tdiff <- vector(mode = "numeric",length = 0)
    trial$Sessions <- vector(mode = "character",length = length(trial))

        count <-1

            for(i in 1:(length(trial$uuid)-1)) {

                tdiff[i] <- difftime(time[i+1], time[i],units = "secs")

                # If same user ID

                if (trial$uuid[i+1]==trial$uuid[i]){

                    if (tdiff[i]<3600){
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count


                    }else{
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count
                        count <- count+1
                    }

                    # If different user ID
                }else{

                    if (tdiff[i]<3600){
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count

                    }else{
                        trial$Sessions[i] <- count
                        trial$Sessions[i+1] <- count
                        count <- count+1
                    }

                    count <- 1
                }
            }

更新：我找到了我的问题的答案，以及您可以在下面找到的代码的快速替代方法！

Answer 1

因为你已经开始使用dplyr：

trial <- 

trial %>% 
  arrange(uuid, created_at) %>% 
  group_by(uuid) %>% 
  mutate(diff = difftime(created_at, lag(created_at), units = 'secs'), # calculate timediff for each row
    diff = as.numeric(diff >= 3600), # flags each new session with the number 1
    diff = ifelse(is.na(diff), 1, diff), %>% #replaces the first row of each group with 1 
    Sessions = cumsum(diff)) %>% #sum all the sessions for each group
  select(-diff) # remove diff column

Answer 2

您可以尝试使用data.table：

执行此操作

require(data.table)
N <- 4

trial <- data.table(uuid = rep(1:2, each = N),
                    created_at = as.POSIXct(60* 10 *rep(1:N, times = 2)*
                                              rep(1:N, times = 2),
                                            origin = "1990-01-01"))

setkey(trial, uuid, created_at)
trial
#    uuid          created_at
# 1:    1 1990-01-01 02:10:00
# 2:    1 1990-01-01 02:40:00
# 3:    1 1990-01-01 03:30:00
# 4:    1 1990-01-01 04:40:00
# 5:    2 1990-01-01 02:10:00
# 6:    2 1990-01-01 02:40:00
# 7:    2 1990-01-01 03:30:00
# 8:    2 1990-01-01 04:40:00

trial[, dif := c(1, as.numeric(diff(created_at), units = "secs"))]
trial[, ii := .GRP, by = uuid]
trial[, ii := ii - lag(ii)]
trial[is.na(ii), ii := 1L]
trial[, i := ifelse(dif < 3600, 0L, 1L)]
trial[ii == 1L, i := 0L]
trial[, Sessions := cumsum(i), by = uuid]
trial[, Sessions := Sessions + 1L, by = uuid]
trial
#    uuid          created_at   dif ii i Sessions
# 1:    1 1990-01-01 02:10:00     1  1 0        1
# 2:    1 1990-01-01 02:40:00  1800  0 0        1
# 3:    1 1990-01-01 03:30:00  3000  0 0        1
# 4:    1 1990-01-01 04:40:00  4200  0 1        2
# 5:    2 1990-01-01 02:10:00 -9000  1 0        1
# 6:    2 1990-01-01 02:40:00  1800  0 0        1
# 7:    2 1990-01-01 03:30:00  3000  0 0        1
# 8:    2 1990-01-01 04:40:00  4200  0 1        2

Answer 3

我找到了一种使用矢量微积分使其工作的非常有效和快速的方法。运行代码花了我30秒（而不是平均5小时！）

   library(data.table);library(sqldf)

        # Ordering by uuid and created_at
        LID<-LID[order(LID$uuid,LID$created_at),]

        # Computing time difference (sec) between the current and previous ligne 
        LID$created_at <- as.POSIXct(as.character(LID$created_at)) 
        LID$diff<-c(9999,LID$created_at[-1]-LID$created_at[-nrow(LID)])
        options(stringAsFactor = FALSE) 

        # Lines corresponding to a new uuid 
        w<-which(LID$uuid[-1]!=LID$uuid[-nrow(LID)])

        # Putting the duration to NA when there is a change of uuid
        LID$diff[w+1]<-9999

        # Identifying sessions changes that are greater than 3600 sec (1 hour)
        LID$chg_session<-as.numeric(LID$diff>3600)

        # Cumulating and determining the id_sessions with the inverse of Differencing
        LID$idsession<-diffinv(LID$chg_session)[-1]

通过快速计算创建用户会话

3 个答案: