Question

我想遍历以下data.frame并按由X2中的值所确定的顺序条目进行分组。因此，在以下data.frame中，我们可以看到四个组：1-3、5-6、9-13和16。我们可以具有组大小和组数的任意组合。

                                            X1 X2               X3                       X4
1   1_21/08/2014 22:56CONTENT_ACCESS.preparing  1 21/08/2014 22:56 CONTENT_ACCESS.preparing
2   2_21/08/2014 22:57CONTENT_ACCESS.preparing  2 21/08/2014 22:57 CONTENT_ACCESS.preparing
3   3_21/08/2014 22:58CONTENT_ACCESS.preparing  3 21/08/2014 22:58 CONTENT_ACCESS.preparing
4   5_21/08/2014 23:07CONTENT_ACCESS.preparing  5 21/08/2014 23:07 CONTENT_ACCESS.preparing
5   6_21/08/2014 23:08CONTENT_ACCESS.preparing  6 21/08/2014 23:08 CONTENT_ACCESS.preparing
6   9_21/08/2014 23:29CONTENT_ACCESS.preparing  9 21/08/2014 23:29 CONTENT_ACCESS.preparing
7  10_21/08/2014 23:30CONTENT_ACCESS.preparing 10 21/08/2014 23:30 CONTENT_ACCESS.preparing
8  11_21/08/2014 23:31CONTENT_ACCESS.preparing 11 21/08/2014 23:31 CONTENT_ACCESS.preparing
9  12_21/08/2014 23:33CONTENT_ACCESS.preparing 12 21/08/2014 23:33 CONTENT_ACCESS.preparing
10 13_21/08/2014 23:34CONTENT_ACCESS.preparing 13 21/08/2014 23:34 CONTENT_ACCESS.preparing
11 16_21/08/2014 23:40CONTENT_ACCESS.preparing 16 21/08/2014 23:40 CONTENT_ACCESS.preparing

我想捕获X3中的时间戳，以便它们可以描述时间范围（即每个组的第一个和最后一个时间戳）并产生此输出。在每个组中，start_ts是第一个时间戳，stop_ts是最后一个时间戳：

student_id session_id start_ts           stop_ts             week micro_process
1          4         16 21/08/2014 22:56 21/08/2014 22:58    4          TASK
2          4         16 21/08/2014 23:07 21/08/2014 23:08    4          TASK
3          4         16 21/08/2014 23:29 21/08/2014 23:34    4          TASK
3          4         16 21/08/2014 23:40 21/08/2014 23:40    4          TASK

我还没有尝试过循环，但是想看看如何在不使用传统循环的情况下进行循环。我的代码目前仅捕获整个组的范围：

  student_id session_id         start_ts          stop_ts week micro_process
1          4         16 21/08/2014 22:58 21/08/2014 23:30    4          TASK

在我的示例中，其他变量（学生ID等）已被虚拟化，并没有严格意义，但是为了完整起见，我想保留它们。

代码（可以直接运行）：

library(stringr)
options(stringsAsFactors = FALSE) 

eventised_session <- data.frame(student_id=integer(),
                                session_id=integer(), 
                                start_ts=character(),
                                stop_ts=character(),
                                week=integer(),
                                micro_process=character())

string_match.df <- structure(list(X1 = c("1_21/08/2014 22:56CONTENT_ACCESS.preparing", 
                                         "2_21/08/2014 22:57CONTENT_ACCESS.preparing", "3_21/08/2014 22:58CONTENT_ACCESS.preparing", 
                                         "5_21/08/2014 23:07CONTENT_ACCESS.preparing", "6_21/08/2014 23:08CONTENT_ACCESS.preparing", 
                                         "9_21/08/2014 23:29CONTENT_ACCESS.preparing", "10_21/08/2014 23:30CONTENT_ACCESS.preparing", 
                                         "11_21/08/2014 23:31CONTENT_ACCESS.preparing", "12_21/08/2014 23:33CONTENT_ACCESS.preparing", 
                                         "13_21/08/2014 23:34CONTENT_ACCESS.preparing", "16_21/08/2014 23:40CONTENT_ACCESS.preparing"
), X2 = c("1", "2", "3", "5", "6", "9", "10", "11", "12", "13", 
          "16"), X3 = c("21/08/2014 22:56", "21/08/2014 22:57", "21/08/2014 22:58", 
                        "21/08/2014 23:07", "21/08/2014 23:08", "21/08/2014 23:29", "21/08/2014 23:30", 
                        "21/08/2014 23:31", "21/08/2014 23:33", "21/08/2014 23:34", "21/08/2014 23:40"
          ), X4 = c("CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", 
                    "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", 
                    "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", 
                    "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing"
          )), .Names = c("X1", "X2", "X3", "X4"), row.names = c(NA, -11L
          ), class = "data.frame")

r_student_id <- 4
r_session_id <- 16
r_week <- 4
r_mic_proc <- "TASK"

string_match.df

#Get the first and last timestamp in matched sequence
r_start_ts <- string_match.df[1, ncol(string_match.df)-1]
r_stop_ts <- string_match.df[nrow(string_match.df), ncol(string_match.df)-1]

eventised_session[nrow(eventised_session)+1,] <- c(r_student_id, r_session_id, r_start_ts, r_stop_ts, r_week, r_mic_proc)

eventised_session

非常感谢您在这方面的专业知识。我只使用过传统的循环。

Answer 1

我们转换为数字，减去一个序列，以便相邻的数字将转换为相同的数字。由于您没有提供与示例数据名称不同的所需输出和引用列名称，因此我猜测最终结果（基于其他答案）：

string_match.df$X2 = as.numeric(string_match.df$X2)
string_match.df$grp = string_match.df$X2 - 1:nrow(string_match.df)
string_match.df

library(dplyr)
string_match.df %>%
  group_by(grp) %>% 
  summarize(start = first(X3), stop = last(X3))
#     grp start            stop            
#   <dbl> <chr>            <chr>           
# 1     0 21/08/2014 22:56 21/08/2014 22:58
# 2     1 21/08/2014 23:07 21/08/2014 23:08
# 3     3 21/08/2014 23:29 21/08/2014 23:34
# 4     5 21/08/2014 23:40 21/08/2014 23:40

作为旁注，请小心使用“矩阵”一词。您在问题中使用了矩阵标签并多次使用了单词matrix，但是您没有matrix，也不应使用它。您有一个data.frame。在matrix中，所有数据都必须是同一类型。在数据框中，列可以具有不同的类型。在这里，您有一个数字列，两个字符串列和一个datetime列，因此矩阵是一个不好的选择。每个列都可以属于适当类的数据框要好得多。

Answer 2

我为数据使用了一个较短的名称，并将df $ X2转换为数字：

df <- string_match.df  # as defined in OP
df$X2 <- as.numeric(df$X2)

您可以使用cumsum和diff的组合来拆分数据帧：

cumsum(diff(c(0,as.numdf$X2))>1)
#  [1] 0 0 0 1 1 2 2 2 2 2 3
# presumes that df$X2[1] is 1, but you can  easily make up a general case:
#  cumsum(diff(c(df$X2[1]-1,df$X2))>1)

现在只需使用split和lapply：

do.call(rbind,lapply(split(df, cumsum(diff(c(0,df$X2))>1)), function(x) {foo <- x$X3; data.frame(start_ts=foo[1], stop_ts=tail(foo,1))}))
# output:
          start_ts          stop_ts
0 21/08/2014 22:56 21/08/2014 22:58
1 21/08/2014 23:07 21/08/2014 23:08
2 21/08/2014 23:29 21/08/2014 23:34
3 21/08/2014 23:40 21/08/2014 23:40

剩下的就是按照您希望的格式格式化输出的问题。

Answer 3

您可以在tidyverse中轻松完成新问题。您要做的主要事情是根据timestamp变量将观察结果分成几组。我假设规则是自上次观察以来超过2分钟会开始一个新小组。您可以根据需要轻松地进行更改。

将观察结果分组后，您可以简单地使用summarize来按组返回计算结果（在这种情况下，第一个和最后一个时间点）：

library(dplyr)
library(lubridate)

string_match.df %>%
    select('id' = X2,                              # Select and rename variables
           'timestamp' = X3) %>%
    mutate(timestamp = dmy_hm(timestamp),          # Parse timestamp as date
           time_diff = timestamp - lag(timestamp), # Calculate time from last obs
           new_obs = time_diff > 2) |              # New obs. if >2 min from last one
                     is.na(time_diff),             #   or, if it's the 1st obs.
           group_id = cumsum(new_obs)) %>%         # Count new groups for group ID
    group_by(group_id) %>%                         # Group by 'group_id'
    summarize(start_ts = min(timestamp),           # Then return the first and last
              stop_ts = max(timestamp))            #  timestamps for each group

# A tibble: 4 x 3
  group_id start_ts            stop_ts            
     <int> <dttm>              <dttm>             
1        1 2014-08-21 22:56:00 2014-08-21 22:58:00
2        2 2014-08-21 23:07:00 2014-08-21 23:08:00
3        3 2014-08-21 23:29:00 2014-08-21 23:34:00
4        4 2014-08-21 23:40:00 2014-08-21 23:40:00

由于您的问题中没有关于如何确定student_id，session_id，week和micro_process的讨论，因此我从示例中省略了它们。之后，您可以轻松地将它们添加到表中，或者如果新规则是通过分析组的数据确定的，则可以将它们添加到summarize调用中。

R中的高效数据帧循环

3 个答案: