具有中断的增量序列

时间:2018-05-14 06:48:33

标签: r dplyr data.table run-length-encoding

我有一个重复序列为TRUE的数据集,我希望根据某些条件(id)和序列的增量值进行标记。 FALSE打破TRUE s的序列,并且第一个FALSE打破任何给定的TRUE序列应该包含在该序列中。 FALSE之间的连续TRUE s无关紧要,标记为0.

例如:

> test
   id logical sequence
1   1    TRUE        1
2   1    TRUE        1
3   1   FALSE        1
4   1    TRUE        2
5   1    TRUE        2
6   1   FALSE        2
7   1    TRUE        3
8   2    TRUE        1
9   2    TRUE        1
10  2    TRUE        1
11  2   FALSE        1
12  2    TRUE        2
13  2    TRUE        2
14  2    TRUE        2
15  3   FALSE        0
16  3   FALSE        0
17  3   FALSE        0
18  3    TRUE        1
19  3   FALSE        1
20  3    TRUE        2
21  3   FALSE        2
22  3   FALSE        0
23  3   FALSE        0
24  3   FALSE        0
25  3    TRUE        3

等等。我考虑使用生成

rle()
> rle(test$logical)
Run Length Encoding
  lengths: int [1:13] 2 1 2 1 4 1 3 3 1 1 ...
  values : logi [1:13] TRUE FALSE TRUE FALSE TRUE FALSE ...

但我不知道如何将这个映射回数据框。有关如何解决这个问题的任何建议?

以下是示例数据:

> dput(test)
structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 
2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), logical = c(TRUE, TRUE, 
FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, 
TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, 
FALSE, FALSE, TRUE)), .Names = c("id", "logical"), class = "data.frame", row.names = c(NA, 
-25L))

4 个答案:

答案 0 :(得分:5)

data.table解决方案:

# load the 'data.table'-package & convert 'test' to a data.table with 'setDT'
library(data.table)
setDT(test)

# calculate the new sequence
test[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id
     ][new_seq != 0, new_seq := rleid(new_seq), by = id][]

给出:

    id logical new_seq
 1:  1    TRUE       1
 2:  1    TRUE       1
 3:  1   FALSE       1
 4:  1    TRUE       2
 5:  1    TRUE       2
 6:  1   FALSE       2
 7:  1    TRUE       3
 8:  2    TRUE       1
 9:  2    TRUE       1
10:  2    TRUE       1
11:  2   FALSE       1
12:  2    TRUE       2
13:  2    TRUE       2
14:  2    TRUE       2
15:  3   FALSE       0
16:  3   FALSE       0
17:  3   FALSE       0
18:  3    TRUE       1
19:  3   FALSE       1
20:  3    TRUE       2
21:  3   FALSE       2
22:  3   FALSE       0
23:  3   FALSE       0
24:  3   FALSE       0
25:  3    TRUE       3

这是做什么的:

  • rleid(logical) - !logical创建一个数字游程长度ID,并为1等于logical
  • 的地方减去FALSE
  • 然后将上一步的结果与!(!logical & !shift(logical, fill = FALSE))的结果相乘,这是TRUE / FALSE向量,用于连续的FALSE值除了第一个一个FALSE - 序列。
  • 最后,我们仅为new_seq不等于0并获得所需结果的行创建新的游程长度ID。

稍微改进的替代方案(正如@jogo在评论中所建议的那样):

test[, new_seq := (rleid(logical) - !logical) * (logical | shift(logical, fill = FALSE)), by = id
     ][new_seq != 0, new_seq := rleid(new_seq), by = id][]

答案 1 :(得分:3)

肯定可以更好地实现makeSeq功能,但这样做有效。

这个使用图书馆data.tablemagrittrdplyr

<强>功能

makeSeq <- function(x) {
    res  <- ifelse(!x&!lag(x,default = F),T,x) %>% {!.} %>% lag(default=T) %>% cumsum
    IND2F<- ifelse(!x&!lag(x,default = F),T,x) != x
    res[IND2F]  <- 0
    res[!IND2F] <- rleidv(res[!IND2F])
    return(res)
}

data.table解决方案

setDT(df)[,yourSEQ:=makeSeq(logical),by="id"]
df

tidyverse粉丝使用

df %>% group_by(id) %>% mutate(yourSEQ = makeSeq(logical)) %>% ungroup

<强>结果

> df
    id logical yourSEQ
 1:  1    TRUE       1
 2:  1    TRUE       1
 3:  1   FALSE       1
 4:  1    TRUE       2
 5:  1    TRUE       2
 6:  1   FALSE       2
 7:  1    TRUE       3
 8:  2    TRUE       1
 9:  2    TRUE       1
10:  2    TRUE       1
11:  2   FALSE       1
12:  2    TRUE       2
13:  2    TRUE       2
14:  2    TRUE       2
15:  3   FALSE       0
16:  3   FALSE       0
17:  3   FALSE       0
18:  3    TRUE       1
19:  3   FALSE       1
20:  3    TRUE       2
21:  3   FALSE       2
22:  3   FALSE       0
23:  3   FALSE       0
24:  3   FALSE       0
25:  3    TRUE       3
    id logical yourSEQ

答案 2 :(得分:2)

您可以使用cumsum作为rle值,然后您必须返回并修复顺序FALSE值。

library(dplyr)

test %>%
  group_by(id) %>%
  mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>% 
  mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L)) %>% 
  print(n = 25)

# # A tibble: 25 x 5
# # Groups:   id [3]
#       id logical sequence sum_rle sequence2
#    <int> <lgl>      <int>   <int>     <int>
#  1     1 TRUE           1       1         1
#  2     1 TRUE           1       1         1
#  3     1 FALSE          1       1         1
#  4     1 TRUE           2       2         2
#  5     1 TRUE           2       2         2
#  6     1 FALSE          2       2         2
#  7     1 TRUE           3       3         3
#  8     2 TRUE           1       1         1
#  9     2 TRUE           1       1         1
# 10     2 TRUE           1       1         1
# 11     2 FALSE          1       1         1
# 12     2 TRUE           2       2         2
# 13     2 TRUE           2       2         2
# 14     2 TRUE           2       2         2
# 15     3 FALSE          0       0         0
# 16     3 FALSE          0       0         0
# 17     3 FALSE          0       0         0
# 18     3 TRUE           1       1         1
# 19     3 FALSE          1       1         1
# 20     3 TRUE           2       2         2
# 21     3 FALSE          2       2         2
# 22     3 FALSE          0       2         0
# 23     3 FALSE          0       2         0
# 24     3 FALSE          0       2         0
# 25     3 TRUE           3       3         3

如果您更喜欢同一件事的真正简洁版......

library(dplyr)

group_by(test, id) %>%
  mutate(sequence = if_else(!logical & !lag(logical), 0L, 
                            with(rle(logical), rep(cumsum(values), lengths)), 
                            missing = 0L))

答案 3 :(得分:2)

不使用dplyrmtd0 <- function() { test %>% group_by(id) %>% mutate(sum_rle = with(rle(logical), rep(cumsum(values), lengths))) %>% mutate(sequence2 = if_else(logical == F & lag(logical) == F, 0L, sum_rle, missing = 0L)) } setDT(test) makeSeq <- function(x) { res <- ifelse(!x&!lag(x,default = F),T,x) %>% {!.} %>% lag(default=T) %>% cumsum IND2F<- ifelse(!x&!lag(x,default = F),T,x) != x res[IND2F] <- 0 res[!IND2F] <- rleidv(res[!IND2F]) return(res) } dt0 <- copy(test) dtmtd0 <- function() { dt0[,yourSEQ:=makeSeq(logical),by="id"] } dt1 <- copy(test) dtmtd1 <- function() { dt1[, new_seq := (rleid(logical) - !logical) * !(!logical & !shift(logical, fill = FALSE)), by = id ][new_seq != 0, new_seq := rleid(new_seq), by = id][] } dt4 <- copy(test) dtmtd2 <- function() { dt4[, sequence := { idx <- cumsum(diff(c(FALSE, logical))==1L) mask <- shift(logical, fill=FALSE) | logical idx * mask }, by=id] } microbenchmark(dplyrmtd0(), dtmtd0(), dtmtd1(), dtmtd2(), times=5L) 中的rle以及一些时间:

Unit: milliseconds
        expr      min       lq     mean   median       uq      max neval
 dplyrmtd0() 375.6089 376.7271 433.1885 380.7428 443.8844 588.9791     5
    dtmtd0() 481.5189 487.1245 492.9527 495.6855 500.1588 500.2759     5
    dtmtd1() 146.0376 147.0163 154.7501 152.7157 154.2976 173.6831     5
    dtmtd2() 106.3401 107.7728 112.7580 108.5239 119.4398 121.7131     5

定时:

library(data.table)
library(dplyr)
library(microbenchmark)
M <- 1e6
test <- data.frame(id=sample(LETTERS, M, replace=TRUE) ,
    logical=sample(c(TRUE, FALSE), M, replace=TRUE))
test <- test[order(test$id),]

数据:

select (SUBSTRING(LastName + '22222', 2, 1) +
        SUBSTRING(LastName + '22222', 3, 1) +
        SUBSTRING(LastName + '22222', 5, 1)  +
        SUBSTRING(FirstName + '22222', 2, 1) +
        SUBSTRING(FirstName + '22222', 3, 1) +
        replace(convert(varchar(255), DateOfBirth, 101), '/', '') +
        (CASE WHEN GenderID IN ('1', '2') THEN GenderID ELSE '9' END)
       )
from Client;