按时间和ID运行长度序列

时间:2018-02-14 13:09:47

标签: r

以前似乎没有提出这个问题。

我想找到连续6个小时得1分的科目数。 每小时没有对受试者进行评分,因此如果缺少一小时,则小时不连续,并且该6小时的输出应为NA。 分配NA的原因是我们不知道主题在失踪时间内如何得分。此问题可用于计算连续命中数,但仅在主题参与时才计算。

我的数据框如下所示:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)

我试过使用rle函数(我确信它可能)但我无法在小时和ID上进行调整。 输出将是这样的:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

提前谢谢。

我认为我提供的原始数据集太小,无法使解决方案更具普遍性 我刚用这个数据集尝试了代码,发现这会导致错误的结果。

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)

对于新数据集,输出应为:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

3 个答案:

答案 0 :(得分:7)

要获得分组,您可以将当前小时与滞后小时进行比较,以查看它们是“连续”还是相差1个整数,然后取出cumsum。获得分组后,您可以使用简单的ave来获得所需的输出。

library(data.table)
df$six <-  ave(df$A, 
               cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
               FUN = function(x) {
                    if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
                    else return(rep(NA, length(x)))}
               )

df
#   ID hour A six
#1   1    1 0  NA
#2   1    2 1  NA
#3   1    3 0  NA
#4   1    7 1   0
#5   1    8 1   0
#6   1    9 1   0
#7   1   10 1   0
#8   1   11 1   0
#9   1   12 1   1
#10  1   17 0  NA
#11  1   18 0  NA
#12  1   19 0  NA
#13  2    1 1   0
#14  2    2 1   0
#15  2    3 1   0
#16  2    4 1   0
#17  2    5 1   0
#18  2    6 1   1
#19  2    8 1  NA
#20  2    9 1  NA
#21  2   15 1  NA

如果您只想选择一次患者,则会选择最后一个时间段:

df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
                                                if(all(x==1)) return(c(rep(0, length(x) - 1), 1))
                                                else return(x)}
  )

答案 1 :(得分:7)

以下是使用更新数据集的tidyverse方法:

library(tidyverse)

df %>%
  group_by(ID) %>%
  expand(hour = seq(min(hour), max(hour))) %>%
  left_join(df) %>%
  mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  filter(!is.na(A)) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() ->  df_out2

检查请求的输出:

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31) 
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)

all.equal(df, df_out2)
#output
TRUE

旧答案:

df %>%
  mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
  group_by(ID, rle) %>%
  mutate(sum = cumsum(A),
         six = ifelse(rle >= 6 & A == 1, 0, NA),
         six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
  ungroup() %>%
  select(ID, hour, A, six) %>%
  as.data.frame() ->  df_out2

让我们检查结果是否符合要求:

ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)

请求输出df1

all.equal(df1, df_out2)
#output
TRUE   

一些基准测试:

library(microbenchmark)
library(data.table)

akrun <- function(df){
  setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
                               ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
  i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
  }

missuse <- function(df){
  df %>%
    mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}


Mike <- function(df){
  ave(df$A, 
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
      FUN = function(x) {
        if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}

microbenchmark(Mike(df),
               akrun(df),
               missuse(df))

#output
Unit: microseconds
        expr       min         lq       mean     median         uq       max neval
    Mike(df)   491.291   575.7115   704.2213   597.7155   629.0295  9578.684   100
   akrun(df)  6568.313  6725.5175  7867.4059  6843.5790  7279.2240 69790.755   100
 missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935   100

迈克H的方式!

答案 2 :(得分:5)

我们可以使用rleid中的data.table。创建一个运行长度为&#39; A&#39;的分组列。 (&#39; GRP&#39)。取一个小时的差异&#39;使用&#39;小时&#39;的下一个值,检查它是否等于1并乘以&#39; A&#39;创造'Anew&#39;。按照“Anew”的游程ID进行分组。和&#39;&#39;,if sum&#39; A&#39;大于特定值,用0和1复制,否则返回NA。在最后阶段,通过创建索引(&#39; i1&#39;)将一些溢出NAs指定为0

library(data.table)
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
     ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
#    ID hour A six sixnew
# 1:  1    1 0  NA     NA
# 2:  1    2 1  NA     NA
# 3:  1    3 0  NA     NA
# 4:  1    7 1   0      0
# 5:  1    8 1   0      0
# 6:  1    9 1   0      0
# 7:  1   10 1   0      0
# 8:  1   11 1   0      0
# 9:  1   12 1   1      1
#10:  1   17 0  NA     NA
#11:  1   18 0  NA     NA
#12:  1   19 0  NA     NA
#13:  2    1 1   0      0
#14:  2    2 1   0      0
#15:  2    3 1   0      0
#16:  2    4 1   0      0
#17:  2    5 1   0      0
#18:  2    6 1   1      1
#19:  2    8 1  NA     NA
#20:  2    9 1  NA     NA
#21:  2   15 1  NA     NA

或稍微更紧凑的选项

i1 <- setDT(df)[, if(sum(A)>= 5)  .I[.N] , rleid(c(TRUE,  diff(hour)==1), A)]$V1
df[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
df
#    ID hour A six sixnew
# 1:  1    1 0  NA     NA
# 2:  1    2 1  NA     NA
# 3:  1    3 0  NA     NA
# 4:  1    7 1   0      0
# 5:  1    8 1   0      0
# 6:  1    9 1   0      0
# 7:  1   10 1   0      0
# 8:  1   11 1   0      0
# 9:  1   12 1   1      1
#10:  1   17 0  NA     NA
#11:  1   18 0  NA     NA
#12:  1   19 0  NA     NA
#13:  2    1 1   0      0
#14:  2    2 1   0      0
#15:  2    3 1   0      0
#16:  2    4 1   0      0
#17:  2    5 1   0      0
#18:  2    6 1   1      1
#19:  2    8 1  NA     NA
#20:  2    9 1  NA     NA
#21:  2   15 1  NA     NA

基准

创建了稍大的数据集并测试了解决方案

-data

ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)

ID <-  rep(1:5000, rep(c(12, 9), 2500))
A <- rep(A, 2500)
hour <- rep(hour, 2500)
dftest <- data.frame(ID, hour, A)

-functions

akrun <- function(df){
  df1 <- copy(df)
  setDT(df1)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
                               ][,  sixnew :=if(sum(A)>=5)  rep(c(0, 1), c(.N-1, 1))
     else NA_real_,.(rleid(Anew), grp)]
  i1 <- df1[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
  df1[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
  }

akrun2 <- function(df) {
      df1 <- copy(df)
      i1 <- setDT(df1)[, if(sum(A)>= 5)  .I[.N] , rleid(c(TRUE,  diff(hour)==1), A)]$V1
      df1[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
  }


missuse <- function(df){
  df %>%
    mutate(rle =  rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
    group_by(ID, rle) %>%
    mutate(sum = cumsum(A),
           six = ifelse(rle >= 6 & A == 1, 0, NA),
           six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
    ungroup() %>%
    select(ID, hour, A, six)
}


Mike <- function(df){
  ave(df$A, 
      cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)), 
      FUN = function(x) {
        if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
        else return(rep(NA, length(x)))})
}

-benchmark

microbenchmark(Mike(dftest),
               akrun(dftest),
               akrun2(dftest),
               missuse(dftest),  
                times = 10L, unit = 'relative')

-output

#Unit: relative
#            expr       min        lq      mean   median        uq       max neval cld
#    Mike(dftest)  1.682794  1.754494  1.673811  1.68806  1.632765  1.640221    10 a  
#   akrun(dftest) 13.159245 12.950117 12.176965 12.33716 11.856271 11.095228    10  b 
#  akrun2(dftest)  1.000000  1.000000  1.000000  1.00000  1.000000  1.000000    10 a  
# missuse(dftest) 37.899905 36.773837 34.726845 34.87672 33.155939 30.665840    10   c