Question

此数据表显示学生每年上学的月份。

# Attendance records: one row per (Student, Month) in which the student attended.
DT <- data.table(
  Student = rep(c(1, 2, 3), times = c(9, 8, 8)),
  Month   = c(
    c(1, 2, 3, 5, 6, 7, 8, 11, 12),  # student 1
    c(2, 3, 4, 5, 7, 8, 9, 10),      # student 2
    c(1, 2, 3, 5, 6, 7, 8, 9)        # student 3
  )
)

DT
    Student Month
 1:       1     1
 2:       1     2
 3:       1     3
 4:       1     5
 5:       1     6
 6:       1     7
 7:       1     8
 8:       1    11
 9:       1    12
10:       2     2
11:       2     3
12:       2     4
13:       2     5
14:       2     7
15:       2     8
16:       2     9
17:       2    10
18:       3     1
19:       3     2
20:       3     3
21:       3     5
22:       3     6
23:       3     7
24:       3     8
25:       3     9

我想确定连续三个月的期间（由该期间的第一个月确定）。这是数据表和合格期间的可视化。

       1   2   3   4   5   6   7   8   9   10  11  12


1      *   *   *       *   *   *   *           *   *
       [-------]       [-------]
                           [-------]                           


2          *   *   *   *       *   *   *   *
           [-------]           [-------]
               [-------]           [-------]


3      *   *   *       *   *   *   *   *      
       [-------]       [-------]
                           [-------]
                               [-------]

所需的输出：

id   First_month_in_the_period 

1    1
1    5
1    6
2    2
2    3
2    7
2    8
3    1
3    5
3    6
3    7

正在寻找data.table（或dplyr）解决方案。

Answer 1

使用标准方法（cumsum...diff...condition）识别连续值的游程（run），然后将其与“学生”一起用作分组变量。在每个组中，根据每个游程的长度创建序列，然后加到该游程的第一个月上。

# Label runs of consecutive months: r increments whenever the gap to the
# previous row's Month exceeds 1. Grouping by Student as well keeps a run
# from spanning two students.
# Within a run of .N >= 3 months, every month except the last two starts a
# 3-month period, i.e. Month[1] + 0, 1, ..., .N - 3. Shorter runs have no
# else-branch and therefore contribute no rows.
DT[ , .(start = if(.N >= 3) Month[1] + 0:(.N - 3)),
    by = .(Student, r = cumsum(c(1L, diff(Month) > 1)))]
#     Student r start
#  1:       1 1     1
#  2:       1 2     5
#  3:       1 2     6
#  4:       2 3     2
#  5:       2 3     3
#  6:       2 4     7
#  7:       2 4     8
#  8:       3 4     1
#  9:       3 5     5
# 10:       3 5     6
# 11:       3 5     7

等效的dplyr替代项：

# dplyr pipeline: group by Student and a run id r (r increments when the gap
# to the previous Month exceeds 1), store the period start months of each
# run in a list column of data frames, then unnest that column into rows.
# NOTE(review): unnest() is called without naming the column to unnest;
# newer tidyr versions presumably expect `cols =` -- confirm against the
# installed version.
DT %>% 
  group_by(Student, r = cumsum(c(1L, diff(Month) > 1))) %>%
  summarise(list(data.frame(start = if(n() >= 3) Month[1] + 0:(n() - 3)))) %>%
  tidyr::unnest()

# # A tibble: 11 x 3
# # Groups:   Student [3]
#       Student     r start
#         <dbl> <int> <dbl>
#     1       1     1     1
#     2       1     2     5
#     3       1     2     6
#     4       2     3     2
#     5       2     3     3
#     6       2     4     7
#     7       2     4     8
#     8       3     4     1
#     9       3     5     5
#    10       3     5     6
#    11       3     5     7

Answer 2

使用tidyverse的解决方案。

library(tidyverse)
library(data.table)

# For every attended month, test whether that month and the following two
# were all attended by the same student; keep only the months that pass.
DT2 <- DT %>%
  arrange(Student, Month) %>%
  group_by(Student) %>%
  # For each row, build the 3-month window starting at that row's Month
  mutate(Seq = map(Month, ~seq.int(.x, .x + 2L))) %>%
  # Flag rows whose whole window appears among the student's Month values
  mutate(Flag = map_lgl(Seq, ~all(.x %in% Month))) %>%
  # Keep only rows whose window is fully attended
  filter(Flag) %>%
  # Drop the helper columns
  select(-Seq, -Flag) %>%
  ungroup()

DT2
# # A tibble: 11 x 2
#    Student Month
#      <dbl> <dbl>
#  1       1     1
#  2       1     5
#  3       1     6
#  4       2     2
#  5       2     3
#  6       2     7
#  7       2     8
#  8       3     1
#  9       3     5
# 10       3     6
# 11       3     7

Answer 3

这是一个解决方案，它使用data.table提供的分组，

# Return the months that start a run of three consecutive months.
#
# month: numeric vector of attended months for one student, in ascending
#        order.
# Returns the elements of `month` that are immediately followed by
# month + 1 and month + 2; a zero-length vector when there are none.
seqfun <- function(month) {
    n <- length(month)
    # Fewer than three months cannot contain a 3-month period. Without this
    # guard, index ranges such as 1:(n - 2) would count backwards.
    if (n < 3L) {
        return(month[0L])
    }
    step <- diff(month)
    # Position i starts a period when both the gap i -> i+1 and the gap
    # i+1 -> i+2 equal 1.
    starts <- which(step[-(n - 1L)] == 1 & step[-1L] == 1)
    month[starts]
}

# Apply seqfun to each student's Month vector; the values returned for a
# group become that group's rows in the result.
Result <- DT[,seqfun(Month), by=Student]
# The computed column is unnamed in j, so give both columns explicit names.
names(Result) <- c("Student","Month")

> Result
    Student Month
 1:       1     1
 2:       1     5
 3:       1     6
 4:       2     2
 5:       2     3
 6:       2     7
 7:       2     8
 8:       3     1
 9:       3     5
10:       3     6
11:       3     7

基本上，它对每个组的月份向量构造3个错位的向量，逐行计算diff，并检查两个差值是否都等于1。如果是，则返回原始月份向量中对应索引处的值。

一点点细节。假设我们有

month <- c(1,2,3,5,6,7,8,11,12)

，我们计算出tmp data.table（注意：您还可以使用zoo包中的rollapply函数来创建类似的表格，我将在最后演示）

当我们将diff跨行使用时，我们得到

> apply(tmp,1,function(x){all(c(1,1)==diff(x))})
[1]  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE

真实值是我们感兴趣的指标。

如上所述，使用zoo库的rollapply，我们可以拥有

> apply(c(1,1)==rollapply(month,width=3,FUN=diff),1,all)
[1]  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE

获取特定学生感兴趣的索引的布尔向量。

Answer 4

这是一个base R解决方案，它创建了可以应用于data.table的函数：

# Find the starting values of every window of n consecutive numbers in x.
#
# x: numeric vector, expected in ascending order.
# n: required number of consecutive values.
# Returns a vector of window start values, or NULL when no run of
# consecutive values in x reaches length n.
cons3fun <- function(x, n) {
  # A new run begins wherever the step from the previous value is not 1.
  run_id <- cumsum(c(1, diff(x) != 1))
  runs <- split(x, run_id)

  # Keep only runs long enough to hold at least one window of n values.
  long_runs <- runs[lengths(runs) > (n - 1)]

  # In a run of length k, the first k - (n - 1) values each start a window.
  starts <- lapply(long_runs, function(run) head(run, length(run) - (n - 1)))

  as.vector(unlist(starts))
}

请注意，此功能将使您可以相当容易地更改要查找的连续数字的数量。在这里，您将使用n=3。然后，您可以使用data.table或dplyr应用此功能。我将使用data.table，因为您使用过。

# Apply cons3fun per student with n = 3; each returned start month becomes
# one row of the result for that student.
DT[,cons3fun(Month,3),by=.(Student)]

希望您觉得这很有用。祝你好运！

Answer 5

这是我使用tidyverse的方法：

> as_tibble(DT) %>%
      arrange(Student, Month) %>%
      group_by(Student) %>%
      # create an identifier for the start of the sequence
      mutate(seq_id = ifelse(row_number() == 1 | Month - lag(Month) > 1,
                             letters[row_number()], NA)) %>%
      fill(seq_id) %>%
      # add another grouping level (sequence identifier)
      group_by(Student, seq_id) %>%
      # only keep data with attendance in 3 or more consecutive months 
      filter(length(seq_id) > 2) %>%
      # n consecutive months => n - 2 periods
      slice(1:(n() - 2)) %>%
      # clean up
      ungroup() %>%
      select(Student, Month)
# A tibble: 11 x 2
#   Student Month
#    <dbl> <dbl>
#1       1     1
#2       1     5
#3       1     6
#4       2     2
#5       2     3
#6       2     7
#7       2     8
#8       3     1
#9       3     5
#10      3     6
#11      3     7

Answer 6

另一种data.table方法...

#first, clculate the difference between months, by student.
ans <- DT[, diff := shift( Month, type = "lead" ) - Month ), by = .(Student)]
#then filter rows that are at the start of 2 consecutive differences of 1
#also, drop the temporary diff-column
ans[ diff == 1 & shift( diff, type = "lead" ) == 1,][, diff := NULL][]

瞧

#    Student Month
# 1:       1     1
# 2:       1     5
# 3:       1     6
# 4:       2     2
# 5:       2     3
# 6:       2     7
# 7:       2     8
# 8:       3     1
# 9:       3     5
# 10:      3     6
# 11:      3     7

在组的data.table字段中标识n个连续数字的组

6 个答案: