我有如下数据:
library(dplyr)
ex <- data.frame(bool = c(rep(FALSE, 2), rep(TRUE, 3), rep(FALSE, 2), rep(TRUE, 5),
FALSE, FALSE, rep(TRUE, 6), FALSE, FALSE, FALSE)) %>%
mutate(seq = data.table::rleid(bool)) %>%
group_by(seq) %>%
mutate(n = n()) %>%
ungroup() %>%
mutate(expected_output = c(4, 4, NA, NA, NA, 4, 4, rep(NA,5), 4, 4, rep(NA, 6), rep(6, 3)))
对于每个FALSE
,我需要找到长度至少为TRUE
的最新4
序列。但是,如果之前没有这样的序列(例如行1:2
或6:7
),我们应该进行检查,即找到观察后出现的长度为4或更大的第一个序列。
ex
的最后一列包含预期的输出。我该怎么做(最好使用tidyverse
)?
使用tidyverse
的解决方案仍然值得赞赏。
答案 0 :(得分:1)
以下内容应使用基数R。
function(col,min_seq =4)
{
end = c(which(c(col[-1],NA)!=col),length(col))
num = diff(c(0,end))
start = end-num+1
seq_n = seq_along(start)
v=col[end]
accept = num >= min_seq & v
st = start[accept]
sn = seq_n[accept]
en = end[accept]
en_ = en
en_[1]=1
place = rep(sn, diff(c(en_,length(col) + 1 ))) # If row with start of sequence is wanted instead of sequence number sn can be replaced with st
place[col]=NA
return(place)
}
答案 1 :(得分:1)
您可以这样做:
定义功能 :(可靠且具有错误处理功能)
fun1<-
function(vec, min_rep = 4) {
stopifnot(length(vec)>0, all(vec %in% 0:1))
runL <- do.call(rbind,rle(vec))
lngth<- ncol(runL)
runL <- rbind(runL, seq = 1:lngth, seq2 = NA^runL[2,])
runL[3,] <- ifelse(!runL[2,]|runL[1,]<min_rep, NA, runL[3,])
cases <- na.omit(runL[3,])
if(length(cases)>0) {
for(i in rev(cases)) {
runL[4,1:i][!is.na(runL[4,1:i])] <- i
}
for(i in cases) {
runL[4,i:lngth][!is.na(runL[4,i:lngth])] <- i
}
} else { runL[4,] <- NA }
return(rep(runL[4,],runL[1,]))
}
呼叫功能:
vec = c(rep(FALSE, 2), rep(TRUE, 3), rep(FALSE, 2), rep(TRUE, 5),
FALSE, FALSE, rep(TRUE, 6), FALSE, FALSE, FALSE)
cbind(vec,fun1(vec))
vec = rep(T,5)
cbind(vec,fun1(vec))
vec = rep(F,5)
cbind(vec,fun1(vec))
vec = c(rep(F,5),T)
cbind(vec,fun1(vec))
vec = c()
cbind(vec,fun1(vec))
vec = 1:3
cbind(vec,fun1(vec))
答案 2 :(得分:0)
如果OP严格不希望使用data.table
解决方案,那么我可以删除这篇文章。
这是一种可能的data.table
方法:
#aggregate the dataset by bool and rleid
agg <- DT[, .(rn=.GRP, N=.N), by=.(bool, seq=rleid(bool))]
#extract all the TRUE sequences with length >= 4
true4s <- agg[(bool) & N >= 4L]
#for rows that are FALSE
agg[(!bool), expOut := {
prev <- NA
#find the previous sequence of TRUEs by using data.table non-equi join
#(a rolling join will work too here)
#in addition, do the match in reverse so that we can fill NA with prev value
ans <- true4s[.SD[order(-rn)], {
if (.N > 0L) {
prev <- seq[.N]
}
prev
#for each row in i (see ?data.table for i argument and also ?.EACHI)
#non equi join where earlier row in x to be join with later row in i
}, by=.EACHI, on=.(rn<rn)]$V1
#for the rolling version
#}, by=.EACHI, on=.(rn), roll=Inf]$V1
rev(ans)
}]
#add expected output to original dataset
DT[, expected_output := inverse.rle(list(values=agg$expOut, lengths=agg$N))]
输出:
bool expected_output
1: FALSE 4
2: FALSE 4
3: TRUE NA
4: TRUE NA
5: TRUE NA
6: FALSE 4
7: FALSE 4
8: TRUE NA
9: TRUE NA
10: TRUE NA
11: TRUE NA
12: TRUE NA
13: FALSE 4
14: FALSE 4
15: TRUE NA
16: TRUE NA
17: TRUE NA
18: TRUE NA
19: TRUE NA
20: TRUE NA
21: FALSE 6
22: FALSE 6
23: FALSE 6
bool expected_output
数据:
library(data.table)
DT <- data.table(bool = c(rep(FALSE, 2), rep(TRUE, 3), rep(FALSE, 2), rep(TRUE, 5),
FALSE, FALSE, rep(TRUE, 6), FALSE, FALSE, FALSE))