对于每个参与者和每个试验,我需要检查按CURRENT_ID连续的所有行:第一行在A列中的值是否为0,最后一行在B列中的值是否为0。如果两个条件都满足,我希望新列C中的值为0;否则,我希望值为1。
如果您有任何建议,我将非常感激。
以下是数据的一些示例行:
A B participant trial CURRENT_ID C
0 1 ppt01 45 3 0
1 0 ppt01 45 4 0
0 1 ppt01 45 10 0
0 0 ppt01 45 11 0
1 0 ppt01 45 12 0
0 1 ppt01 87 2 0
1 0 ppt01 87 3 0
1 1 ppt01 87 4 1
1 1 ppt01 87 5 1
0 1 ppt01 34 6 0
0 0 ppt01 34 7 0
0 0 ppt01 34 8 0
0 0 ppt01 34 9 0
0 0 ppt01 34 10 0
1 0 ppt01 34 11 0
0 1 ppt01 8 5 0
1 0 ppt01 8 6 0
0 1 ppt01 8 9 0
0 0 ppt01 8 10 0
0 0 ppt01 8 11 0
1 0 ppt01 8 12 0
0 1 ppt02 87 2 0
0 0 ppt02 87 3 0
0 0 ppt02 87 4 0
1 0 ppt02 87 5 0
0 1 ppt02 55 5 0
1 0 ppt02 55 6 0
0 1 ppt02 55 9 0
1 0 ppt02 55 10 0
0 1 ppt02 55 11 1
1 0 ppt02 55 12 0
0 1 ppt02 22 2 0
1 0 ppt02 22 3 0
0 1 ppt02 22 4 1
0 1 ppt02 22 10 0
1 0 ppt02 22 11 1
1 1 ppt02 22 12 1
编辑:我需要考虑每个参与者和试验的每对连续行(基于CURRENT_ID的值连续)。在上面的示例中,第8行和第9行在新列C中的值为1,因为第8行在A列中的值为1(而不是0),第9行在B列中的值为1(而不是0)。
A B participant trial CURRENT_ID C
1 1 ppt01 87 4 1
1 1 ppt01 87 5 1
EDIT2:以下是我需要考虑的行对:
A B participant trial CURRENT_ID C
0 1 ppt01 45 3 0
1 0 ppt01 45 4 0
0 1 ppt01 45 10 0
0 0 ppt01 45 11 0
0 0 ppt01 45 11 0
1 0 ppt01 45 12 0
0 1 ppt01 87 2 0
1 0 ppt01 87 3 0
1 0 ppt01 87 3 0
1 1 ppt01 87 4 1
1 1 ppt01 87 4 1
1 1 ppt01 87 5 1
0 1 ppt01 34 6 0
0 0 ppt01 34 7 0
0 0 ppt01 34 7 0
0 0 ppt01 34 8 0
0 0 ppt01 34 8 0
0 0 ppt01 34 9 0
0 0 ppt01 34 9 0
0 0 ppt01 34 10 0
0 0 ppt01 34 10 0
1 0 ppt01 34 11 0
答案 0 :(得分:3)
一个古老的问题,但仍然有意义,尚未得到公认的答案!
问题并不完全清楚,但我发现了两种解释方式。C应为1的条件可以是:
1)B == 1 且 lag(A) == 1(即上一行的A为1);
2)A == 1 且 B == 1 在至少连续两行中为TRUE。
下面我针对OP问题的这两种解释分别给出解决方案。
library(tidyverse)

mydf2 <- mydf %>%
  # Number the runs of consecutive CURRENT_ID values within each
  # participant/trial: a new run starts wherever the step to the previous
  # row is not exactly 1.
  group_by(participant, trial) %>%
  mutate(consec_rows = cumsum(c(1, diff(CURRENT_ID) != 1))) %>%
  # Within each run, flag rows where B is non-zero and the previous row's A
  # is non-zero. The first row of a run has no predecessor, so lag(A) is NA
  # there and the flag is NA whenever B is non-zero.
  group_by(participant, trial, consec_rows) %>%
  mutate(cond_consec = dplyr::lag(A) & B)

# Print as a plain data frame so that all rows are shown.
as.data.frame(mydf2)
#> A B participant trial CURRENT_ID C consec_rows cond_consec
#> 1 0 1 ppt01 45 3 0 1 NA
#> 2 1 0 ppt01 45 4 0 1 FALSE
#> 3 0 1 ppt01 45 10 0 2 NA
#> 4 0 0 ppt01 45 11 0 2 FALSE
#> 5 1 0 ppt01 45 12 0 2 FALSE
#> 6 0 1 ppt01 87 2 0 1 NA
#> 7 1 0 ppt01 87 3 0 1 FALSE
#> 8 1 1 ppt01 87 4 1 1 TRUE
#> 9 1 1 ppt01 87 5 1 1 TRUE
#> 10 0 1 ppt01 34 6 0 1 NA
#> 11 0 0 ppt01 34 7 0 1 FALSE
#> 12 0 0 ppt01 34 8 0 1 FALSE
#> 13 0 0 ppt01 34 9 0 1 FALSE
#> 14 0 0 ppt01 34 10 0 1 FALSE
#> 15 1 0 ppt01 34 11 0 1 FALSE
#> 16 0 1 ppt01 8 5 0 1 NA
#> 17 1 0 ppt01 8 6 0 1 FALSE
#> 18 0 1 ppt01 8 9 0 2 NA
#> 19 0 0 ppt01 8 10 0 2 FALSE
#> 20 0 0 ppt01 8 11 0 2 FALSE
#> 21 1 0 ppt01 8 12 0 2 FALSE
#> 22 0 1 ppt02 87 2 0 1 NA
#> 23 0 0 ppt02 87 3 0 1 FALSE
#> 24 0 0 ppt02 87 4 0 1 FALSE
#> 25 1 0 ppt02 87 5 0 1 FALSE
#> 26 0 1 ppt02 55 5 0 1 NA
#> 27 1 0 ppt02 55 6 0 1 FALSE
#> 28 0 1 ppt02 55 9 0 2 NA
#> 29 1 0 ppt02 55 10 0 2 FALSE
#> 30 0 1 ppt02 55 11 1 2 TRUE
#> 31 1 0 ppt02 55 12 0 2 FALSE
#> 32 0 1 ppt02 22 2 0 1 NA
#> 33 1 0 ppt02 22 3 0 1 FALSE
#> 34 0 1 ppt02 22 4 1 1 TRUE
#> 35 0 1 ppt02 22 10 0 2 NA
#> 36 1 0 ppt02 22 11 1 2 FALSE
#> 37 1 1 ppt02 22 12 1 2 TRUE
据我对提问者(OP)所述要求的理解,我认为第36行中期望的C == 1实际上应该为0。
reprex package(v0.3.0)于2020-05-16创建
或者,如果条件是连续行中的A == 1和B == 1:
mydf %>%
  # Group by participant, trial and a run counter that increases whenever
  # CURRENT_ID does not step by exactly 1 from the previous row.
  group_by(
    participant,
    trial,
    consec = cumsum(c(1, diff(CURRENT_ID) != 1))
  ) %>%
  mutate(
    cond_consec = {
      # Run-length encode "A and B are both non-zero", keep only the TRUE
      # runs that span at least two rows, then expand back to one value
      # per row.
      runs <- rle(A & B)
      runs$values <- runs$values & runs$lengths >= 2
      inverse.rle(runs)
    }
  )
#> # A tibble: 37 x 8
#> # Groups: participant, trial, consec [11]
#> A B participant trial CURRENT_ID C consec cond_consec
#> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <lgl>
#> 1 0 1 ppt01 45 3 0 1 FALSE
#> 2 1 0 ppt01 45 4 0 1 FALSE
#> 3 0 1 ppt01 45 10 0 2 FALSE
#> 4 0 0 ppt01 45 11 0 2 FALSE
#> 5 1 0 ppt01 45 12 0 2 FALSE
#> 6 0 1 ppt01 87 2 0 3 FALSE
#> 7 1 0 ppt01 87 3 0 3 FALSE
#> 8 1 1 ppt01 87 4 1 3 TRUE
#> 9 1 1 ppt01 87 5 1 3 TRUE
#> 10 0 1 ppt01 34 6 0 3 FALSE
说明
第1部分:在逻辑运算中,R会把0和1分别当作FALSE和TRUE,因此不需要使用 == 运算符。B & lag(A) 等效于 B == 1 & lag(A) == 1。
第2部分(如果 A == 1 & B == 1 出现在连续的行中):使用 rle 构造条件语句,并检查值为TRUE的游程的 rle$lengths 是否 >= 2。
答案 1 :(得分:2)
Base R解决方案:
# Build a key identifying each run of consecutive CURRENT_ID values within a
# participant/trial combination. A new run starts wherever CURRENT_ID does not
# step by exactly 1 from the previous row (same rule as the tidyverse version).
mydf$grouping_vec <- with(
  mydf,
  paste(
    participant,
    trial,
    ave(CURRENT_ID, participant, trial, FUN = function(x) {
      cumsum(c(1, diff(x) != 1))
    }),
    sep = " - "
  )
)

# Recompute C per run: 0 when the run's first A is 0 and its last B is 0,
# otherwise 1. ave() broadcasts the per-run value back to every row, so the
# result keeps the input row order (split() + rbind would sort the rows by
# the grouping key instead).
local({
  first_a <- ave(mydf$A, mydf$grouping_vec, FUN = function(a) a[1])
  last_b <- ave(mydf$B, mydf$grouping_vec, FUN = function(b) b[length(b)])

  # Drop any existing C column so the recomputed one is appended last.
  out <- mydf[, names(mydf) != "C", drop = FALSE]
  out$C <- ifelse(first_a == 0 & last_b == 0, 0, 1)
  row.names(out) <- NULL
  out
})
Tidyverse解决方案:
library(tidyverse)

mydf %>%
  # Key for each run of consecutive CURRENT_ID values: participant, trial and
  # a counter that increases whenever CURRENT_ID does not step by exactly 1.
  # `sep` must be an argument of str_c(), so it sits inside that call.
  mutate(
    grouping_vec = str_c(
      participant,
      trial,
      cumsum(c(1, diff(CURRENT_ID) != 1)),
      sep = " - "
    )
  ) %>%
  group_by(grouping_vec) %>%
  # C is 0 when the run's first A is 0 and its last B is 0, otherwise 1.
  mutate(C = if_else(first(A) == 0 & last(B) == 0, 0, 1)) %>%
  ungroup() %>%
  # The key was only needed for grouping.
  select(-grouping_vec)
答案 2 :(得分:1)
如果您想在每个参与者-试验组内把行两两配对(A/B对,见AB列)后再判断,这应该有效:
d %>%
  group_by(participant, trial) %>%
  # Pair up rows within each participant/trial: rows 1-2 get AB = 1,
  # rows 3-4 get AB = 2, and so on. seq_len() is used instead of 1:n() so an
  # empty group yields an empty sequence rather than c(1, 0).
  mutate(AB = ceiling(seq_len(n()) / 2)) %>%
  group_by(participant, trial, AB) %>%
  # A leftover single row gets 0; a pair gets 0 only when its first A is 0
  # and its second B is 0, otherwise 1.
  mutate(
    newC = ifelse(length(A) == 1 | (A[1] == 0 & B[2] == 0), 0, 1)
  )
我保留了新增的辅助列(AB),以便您可以看到这是如何完成的。
输出:
# A tibble: 15 x 8
A B participant trial CURRENT_ID C AB newC
<int> <int> <chr> <int> <int> <int> <dbl> <dbl>
1 0 1 ppt01 45 3 0 1 0
2 1 0 ppt01 45 4 0 1 0
3 0 1 ppt01 45 10 0 2 0
4 0 0 ppt01 45 11 0 2 0
5 1 0 ppt01 45 12 0 3 0
6 0 1 ppt01 87 2 0 1 0
7 1 0 ppt01 87 3 0 1 0
8 1 1 ppt01 87 4 1 2 1
9 1 1 ppt01 87 5 1 2 1
10 0 1 ppt01 34 6 0 1 0
11 0 0 ppt01 34 7 0 1 0
12 0 0 ppt01 34 8 0 2 0
13 0 0 ppt01 34 9 0 2 0
14 0 0 ppt01 34 10 0 3 0
15 1 0 ppt01 34 11 0 3 0
否则,正如最初描述的那样:
# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the pipeline fail later with a less helpful message.
library(dplyr)

d %>%
  group_by(participant, trial) %>%
  # One value per participant/trial: 0 when the first A is 0 and the last B
  # is 0, otherwise 1.
  mutate(newC = ifelse(first(A) == 0 & last(B) == 0, 0, 1))
输出:
Source: local data frame [15 x 7]
Groups: participant, trial [3]
# A tibble: 15 x 7
A B participant trial CURRENT_ID C newC
<int> <int> <chr> <int> <int> <int> <dbl>
1 0 1 ppt01 45 3 0 0
2 1 0 ppt01 45 4 0 0
3 0 1 ppt01 45 10 0 0
4 0 0 ppt01 45 11 0 0
5 1 0 ppt01 45 12 0 0
6 0 1 ppt01 87 2 0 1
7 1 0 ppt01 87 3 0 1
8 1 1 ppt01 87 4 1 1
9 1 1 ppt01 87 5 1 1
10 0 1 ppt01 34 6 0 0
11 0 0 ppt01 34 7 0 0
12 0 0 ppt01 34 8 0 0
13 0 0 ppt01 34 9 0 0
14 0 0 ppt01 34 10 0 0
15 1 0 ppt01 34 11 0 0
我使用了您数据的一个子集(以下为 dput() 的输出):
# Example data: 15 rows for participant "ppt01" covering trials 45, 87 and 34.
# All numeric columns are integers; participant is a character column.
d <- data.frame(
  A = c(0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L),
  B = c(1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L),
  participant = rep("ppt01", 15L),
  # 5 rows of trial 45, 4 rows of trial 87, 6 rows of trial 34.
  trial = rep(c(45L, 87L, 34L), times = c(5L, 4L, 6L)),
  # Trial 45 has a gap (3, 4, then 10-12); trials 87 and 34 together run 2-11.
  CURRENT_ID = c(3L, 4L, 10:12, 2:11),
  # Only rows 8 and 9 carry C = 1.
  C = rep(c(0L, 1L, 0L), times = c(7L, 2L, 6L)),
  stringsAsFactors = FALSE
)