我有来自IM聊天的数据,其中每一行都是一个按键。我想自动添加most_recent_enter
列(此处手动添加,以进行说明),该列跟踪keystroke
== ENTER
的最新行。
在此示例中,有重叠的消息,因此最后一个打ENTER
的用户不一定是最新的用户。我将在此处添加其他列,以显示我可以使用的信息。
# Example keystroke log: one row per keypress. most_recent_enter is the
# hand-built target column — the sender of the latest ENTER typed before
# the current row (NA until the first ENTER has occurred).
x <- data.frame(
  overall_idx = 1:14,
  sender = c("a", "a", "a", "b", "b", "b", "a",
             "a", "a", "a", "b", "b", "a", "a"),
  keystroke = c("H", "I", "ENTER", "K", "I", "ENTER",
                "W", "H", "I", "C", "W", "H", "H", "T"),
  ks_idx = c(0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 3, 3),
  most_recent_enter = c(rep(NA, 3), rep("a", 3), rep("b", 8))
)
是否有找到符合条件的最新行的方法?
编辑 在我的实际数据中,每个对话都标有一个实验编号。我将如何为每个实验重置此值?
# Same keystroke log duplicated across two experiments (exp_num 1 and 2);
# the target column most_recent_enter resets to NA at the start of each
# experiment. Both halves are identical, so each column is one rep() call.
x <- data.frame(
  exp_num = c(rep(1, 14), rep(2, 14)),
  overall_idx = rep(1:14, times = 2),
  sender = rep(c("a", "a", "a", "b", "b", "b", "a",
                 "a", "a", "a", "b", "b", "a", "a"), times = 2),
  keystroke = rep(c("H", "I", "ENTER", "K", "I", "ENTER",
                    "W", "H", "I", "C", "W", "H", "H", "T"), times = 2),
  ks_idx = rep(c(0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 3, 3), times = 2),
  most_recent_enter = rep(c(rep(NA, 3), rep("a", 3), rep("b", 8)), times = 2)
)
编辑2:
当第二个exp_num
中有两个不同的发件人时,所选答案实际上会失败。例如:
# "Edit 2" data: experiment 2 uses different senders ("c"/"d") so that any
# solution which ignores exp_num boundaries is caught out — its labels must
# be c/d in experiment 2, not a/b carried over from experiment 1.
# FIX: the original snippet never closed the data.frame() call (missing the
# final ")"), so it was a syntax error; the closing parenthesis is restored.
x <- data.frame(
  exp_num = c(rep(1, 14), rep(2, 14)),
  overall_idx = c(1:14, 1:14),
  sender = c("a", "a", "a", "b", "b", "b", "a",
             "a", "a", "a", "b", "b", "a", "a",
             "c", "c", "c", "d", "d", "d", "c",
             "c", "c", "c", "d", "d", "c", "c"),
  keystroke = rep(c("H", "I", "ENTER", "K", "I", "ENTER",
                    "W", "H", "I", "C", "W", "H", "H", "T"), times = 2),
  ks_idx = rep(c(0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 3, 3), times = 2),
  most_recent_enter = c(rep(NA, 3), rep("a", 3), rep("b", 8),
                        rep(NA, 3), rep("c", 3), rep("d", 8))
)
这会在 Exp 1 和 Exp 2 中为 most_recent_new_enter 产生相同的 a 和 b,而不是 Exp 2 中应有的 c 和 d。
答案 0 :(得分:1)
我们可以基于 'ENTER' 值创建分组列,在每组中取 'sender' 的 first 元素作为 most_recent_new_enter,然后在 ungroup 之后对其应用 lag。
对于更新后的帖子,我们可以再添加一个 group_by(见后文)。先加载所需的包:
library(dplyr)
# Number the message "runs": run_id increments on every ENTER row, so each
# run starts at an ENTER and covers the keystrokes typed up to (but not
# including) the next ENTER. Within a run the first sender is the person who
# pressed that ENTER (rows before the first ENTER keep run_id 0 and stay NA).
# Lagging by one row then keeps an ENTER row from labelling itself.
x %>%
  mutate(run_id = cumsum(keystroke == 'ENTER')) %>%
  group_by(run_id) %>%
  mutate(most_recent_new_enter = case_when(run_id > 0 ~ first(sender))) %>%
  ungroup() %>%
  mutate(most_recent_new_enter = lag(most_recent_new_enter)) %>%
  select(-run_id)
# A tibble: 14 x 6
# overall_idx sender keystroke ks_idx most_recent_enter most_recent_new_enter
# <int> <fct> <fct> <dbl> <fct> <fct>
# 1 1 a H 0 <NA> <NA>
# 2 2 a I 1 <NA> <NA>
# 3 3 a ENTER 2 <NA> <NA>
# 4 4 b K 0 a a
# 5 5 b I 1 a a
# 6 6 b ENTER 2 a a
# 7 7 a W 0 b b
# 8 8 a H 1 b b
# 9 9 a I 2 b b
#10 10 a C 3 b b
#11 11 b W 0 b b
#12 12 b H 1 b b
#13 13 a H 3 b b
#14 14 a T 3 b b
或使用按实验分组(group_by)的版本:
x %>%
# Per-experiment variant. The initial group_by(exp_num) plus .add = TRUE
# means the cumsum of ENTER is computed within each experiment, so the run
# counter restarts at every new exp_num; the combined (exp_num, grp) grouping
# keeps runs from different experiments distinct even when grp values repeat.
# Regrouping by exp_num alone before lag() ensures the one-row shift does not
# leak a label across an experiment boundary. as.data.frame drops the
# tibble/grouping wrapper for plain printing.
group_by(exp_num) %>%
group_by(grp = cumsum(keystroke == 'ENTER'), .add = TRUE) %>%
mutate(most_recent_new_enter = case_when(grp > 0 ~ first(sender))) %>%
group_by(exp_num) %>%
mutate(most_recent_new_enter = lag(most_recent_new_enter)) %>%
select(-grp) %>%
as.data.frame
#exp_num overall_idx sender keystroke ks_idx most_recent_enter most_recent_new_enter
#1 1 1 a H 0 <NA> <NA>
#2 1 2 a I 1 <NA> <NA>
#3 1 3 a ENTER 2 <NA> <NA>
#4 1 4 b K 0 a a
#5 1 5 b I 1 a a
#6 1 6 b ENTER 2 a a
#7 1 7 a W 0 b b
#8 1 8 a H 1 b b
#9 1 9 a I 2 b b
#10 1 10 a C 3 b b
#11 1 11 b W 0 b b
#12 1 12 b H 1 b b
#13 1 13 a H 3 b b
#14 1 14 a T 3 b b
#15 2 1 c H 0 <NA> <NA>
#16 2 2 c I 1 <NA> <NA>
#17 2 3 c ENTER 2 <NA> <NA>
#18 2 4 d K 0 c c
#19 2 5 d I 1 c c
#20 2 6 d ENTER 2 c c
#21 2 7 c W 0 d d
#22 2 8 c H 1 d d
#23 2 9 c I 2 d d
#24 2 10 c C 3 d d
#25 2 11 d W 0 d d
#26 2 12 d H 1 d d
#27 2 13 c H 3 d d
#28 2 14 c T 3 d d
或使用 tidyr 中的 fill:
library(tidyr)
# Simpler variant: tag each ENTER row with its sender (all other rows NA),
# shift down one row with lag(), then fill() carries the last non-NA label
# forward.
# NOTE(review): this pipeline is not grouped by exp_num, so on the
# multi-experiment data fill() carries the previous experiment's sender into
# the first rows of the next experiment (which should be NA) — wrap in
# group_by(exp_num) to reset per experiment; as written it is only correct
# for the single-experiment data frame.
x %>%
mutate(most_recent_new_enter = lag(case_when(keystroke == 'ENTER' ~ sender))) %>%
fill(most_recent_new_enter)
(或使用 data.table 也可实现同样的逻辑。)
答案 1 :(得分:1)
您也可以这样做:
# For each row (within its experiment), look up the sender of the most
# recent ENTER that occurred strictly before that row; rows before the
# first ENTER get NA.
# Fixes over the original: max() over an empty which() returned -Inf with a
# warning and relied on -Inf being coerced to an NA subscript; the ENTER
# positions were recomputed twice for every row; sapply()'s return type is
# input-dependent (vapply pins it to integer); and the lambda argument was
# named `x`, shadowing the data frame.
x %>%
  group_by(exp_num) %>%
  mutate(most_recent_enter = {
    enter_rows <- which(keystroke == 'ENTER')  # ENTER positions, once per group
    idx <- vapply(seq_len(n()), function(i) {
      prior <- enter_rows[enter_rows < i]      # ENTERs strictly before row i
      if (length(prior) == 0L) NA_integer_ else max(prior)
    }, integer(1))
    sender[idx]                                # sender[NA_integer_] yields NA
  })
输出(前几行):
# A tibble: 28 x 6
# Groups: exp_num [2]
exp_num overall_idx sender keystroke ks_idx most_recent_enter
<dbl> <int> <fct> <fct> <dbl> <fct>
1 1 1 a H 0 NA
2 1 2 a I 1 NA
3 1 3 a ENTER 2 NA
4 1 4 b K 0 a
5 1 5 b I 1 a
6 1 6 b ENTER 2 a
7 1 7 a W 0 b
8 1 8 a H 1 b
9 1 9 a I 2 b
10 1 10 a C 3 b
# ... with 18 more rows
基本上,对每一行,找出 keystroke == ENTER 的行号中仍低于当前行号的最大索引,并用它对 sender 取子集。