我想遍历以下data.frame并按由X2中的值所确定的顺序条目进行分组。因此,在以下data.frame中,我们可以看到四个组:1-3、5-6、9-13和16。我们可以具有组大小和组数的任意组合。
X1 X2 X3 X4
1 1_21/08/2014 22:56CONTENT_ACCESS.preparing 1 21/08/2014 22:56 CONTENT_ACCESS.preparing
2 2_21/08/2014 22:57CONTENT_ACCESS.preparing 2 21/08/2014 22:57 CONTENT_ACCESS.preparing
3 3_21/08/2014 22:58CONTENT_ACCESS.preparing 3 21/08/2014 22:58 CONTENT_ACCESS.preparing
4 5_21/08/2014 23:07CONTENT_ACCESS.preparing 5 21/08/2014 23:07 CONTENT_ACCESS.preparing
5 6_21/08/2014 23:08CONTENT_ACCESS.preparing 6 21/08/2014 23:08 CONTENT_ACCESS.preparing
6 9_21/08/2014 23:29CONTENT_ACCESS.preparing 9 21/08/2014 23:29 CONTENT_ACCESS.preparing
7 10_21/08/2014 23:30CONTENT_ACCESS.preparing 10 21/08/2014 23:30 CONTENT_ACCESS.preparing
8 11_21/08/2014 23:31CONTENT_ACCESS.preparing 11 21/08/2014 23:31 CONTENT_ACCESS.preparing
9 12_21/08/2014 23:33CONTENT_ACCESS.preparing 12 21/08/2014 23:33 CONTENT_ACCESS.preparing
10 13_21/08/2014 23:34CONTENT_ACCESS.preparing 13 21/08/2014 23:34 CONTENT_ACCESS.preparing
11 16_21/08/2014 23:40CONTENT_ACCESS.preparing 16 21/08/2014 23:40 CONTENT_ACCESS.preparing
我想捕获X3中的时间戳,以便它们可以描述时间范围(即每个组的第一个和最后一个时间戳)并产生此输出。在每个组中,start_ts是第一个时间戳,stop_ts是最后一个时间戳:
student_id session_id start_ts stop_ts week micro_process
1 4 16 21/08/2014 22:56 21/08/2014 22:58 4 TASK
2 4 16 21/08/2014 23:07 21/08/2014 23:08 4 TASK
3 4 16 21/08/2014 23:29 21/08/2014 23:34 4 TASK
3 4 16 21/08/2014 23:40 21/08/2014 23:40 4 TASK
我还没有尝试过循环,但是想看看如何在不使用传统循环的情况下进行循环。我的代码目前仅捕获整个组的范围:
student_id session_id start_ts stop_ts week micro_process
1 4 16 21/08/2014 22:58 21/08/2014 23:30 4 TASK
在我的示例中,其他变量(学生ID等)已被虚拟化,并没有严格意义,但是为了完整起见,我想保留它们。
代码(可以直接运行):
library(stringr)
options(stringsAsFactors = FALSE)
eventised_session <- data.frame(student_id=integer(),
session_id=integer(),
start_ts=character(),
stop_ts=character(),
week=integer(),
micro_process=character())
string_match.df <- structure(list(X1 = c("1_21/08/2014 22:56CONTENT_ACCESS.preparing",
"2_21/08/2014 22:57CONTENT_ACCESS.preparing", "3_21/08/2014 22:58CONTENT_ACCESS.preparing",
"5_21/08/2014 23:07CONTENT_ACCESS.preparing", "6_21/08/2014 23:08CONTENT_ACCESS.preparing",
"9_21/08/2014 23:29CONTENT_ACCESS.preparing", "10_21/08/2014 23:30CONTENT_ACCESS.preparing",
"11_21/08/2014 23:31CONTENT_ACCESS.preparing", "12_21/08/2014 23:33CONTENT_ACCESS.preparing",
"13_21/08/2014 23:34CONTENT_ACCESS.preparing", "16_21/08/2014 23:40CONTENT_ACCESS.preparing"
), X2 = c("1", "2", "3", "5", "6", "9", "10", "11", "12", "13",
"16"), X3 = c("21/08/2014 22:56", "21/08/2014 22:57", "21/08/2014 22:58",
"21/08/2014 23:07", "21/08/2014 23:08", "21/08/2014 23:29", "21/08/2014 23:30",
"21/08/2014 23:31", "21/08/2014 23:33", "21/08/2014 23:34", "21/08/2014 23:40"
), X4 = c("CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing",
"CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing",
"CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing",
"CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing", "CONTENT_ACCESS.preparing"
)), .Names = c("X1", "X2", "X3", "X4"), row.names = c(NA, -11L
), class = "data.frame")
r_student_id <- 4
r_session_id <- 16
r_week <- 4
r_mic_proc <- "TASK"
string_match.df
#Get the first and last timestamp in matched sequence
r_start_ts <- string_match.df[1, ncol(string_match.df)-1]
r_stop_ts <- string_match.df[nrow(string_match.df), ncol(string_match.df)-1]
eventised_session[nrow(eventised_session)+1,] <- c(r_student_id, r_session_id, r_start_ts, r_stop_ts, r_week, r_mic_proc)
eventised_session
非常感谢您在这方面的专业知识。我只使用过传统的循环。
答案 0 :(得分:1)
我们转换为数字,减去一个序列,以便相邻的数字将转换为相同的数字。由于您没有提供与示例数据名称不同的所需输出和引用列名称,因此我猜测最终结果(基于其他答案):
string_match.df$X2 = as.numeric(string_match.df$X2)
string_match.df$grp = string_match.df$X2 - 1:nrow(string_match.df)
string_match.df
library(dplyr)
string_match.df %>%
group_by(grp) %>%
summarize(start = first(X3), stop = last(X3))
# grp start stop
# <dbl> <chr> <chr>
# 1 0 21/08/2014 22:56 21/08/2014 22:58
# 2 1 21/08/2014 23:07 21/08/2014 23:08
# 3 3 21/08/2014 23:29 21/08/2014 23:34
# 4 5 21/08/2014 23:40 21/08/2014 23:40
作为旁注,请小心使用“矩阵”一词。您在问题中使用了矩阵标签并多次使用了单词matrix
,但是您没有matrix
,也不应使用它。您有一个data.frame
。在matrix
中,所有数据都必须是同一类型。在数据框中,列可以具有不同的类型。在这里,您有一个数字列,两个字符串列和一个datetime列,因此矩阵是一个不好的选择。每个列都可以属于适当类的数据框要好得多。
答案 1 :(得分:1)
我为数据使用了一个较短的名称,并将df $ X2转换为数字:
df <- string_match.df # as defined in OP
df$X2 <- as.numeric(df$X2)
您可以使用cumsum
和diff
的组合来拆分数据帧:
cumsum(diff(c(0,as.numdf$X2))>1)
# [1] 0 0 0 1 1 2 2 2 2 2 3
# presumes that df$X2[1] is 1, but you can easily make up a general case:
# cumsum(diff(c(df$X2[1]-1,df$X2))>1)
现在只需使用split
和lapply
:
do.call(rbind,lapply(split(df, cumsum(diff(c(0,df$X2))>1)), function(x) {foo <- x$X3; data.frame(start_ts=foo[1], stop_ts=tail(foo,1))}))
# output:
start_ts stop_ts
0 21/08/2014 22:56 21/08/2014 22:58
1 21/08/2014 23:07 21/08/2014 23:08
2 21/08/2014 23:29 21/08/2014 23:34
3 21/08/2014 23:40 21/08/2014 23:40
剩下的就是按照您希望的格式格式化输出的问题。
答案 2 :(得分:0)
您可以在tidyverse
中轻松完成新问题。您要做的主要事情是根据timestamp
变量将观察结果分成几组。我假设规则是自上次观察以来超过2分钟会开始一个新小组。您可以根据需要轻松地进行更改。
将观察结果分组后,您可以简单地使用summarize
来按组返回计算结果(在这种情况下,第一个和最后一个时间点):
library(dplyr)
library(lubridate)
string_match.df %>%
select('id' = X2, # Select and rename variables
'timestamp' = X3) %>%
mutate(timestamp = dmy_hm(timestamp), # Parse timestamp as date
time_diff = timestamp - lag(timestamp), # Calculate time from last obs
new_obs = time_diff > 2) | # New obs. if >2 min from last one
is.na(time_diff), # or, if it's the 1st obs.
group_id = cumsum(new_obs)) %>% # Count new groups for group ID
group_by(group_id) %>% # Group by 'group_id'
summarize(start_ts = min(timestamp), # Then return the first and last
stop_ts = max(timestamp)) # timestamps for each group
# A tibble: 4 x 3
group_id start_ts stop_ts
<int> <dttm> <dttm>
1 1 2014-08-21 22:56:00 2014-08-21 22:58:00
2 2 2014-08-21 23:07:00 2014-08-21 23:08:00
3 3 2014-08-21 23:29:00 2014-08-21 23:34:00
4 4 2014-08-21 23:40:00 2014-08-21 23:40:00
由于您的问题中没有关于如何确定student_id
,session_id
,week
和micro_process
的讨论,因此我从示例中省略了它们。之后,您可以轻松地将它们添加到表中,或者如果新规则是通过分析组的数据确定的,则可以将它们添加到summarize
调用中。