查找最后一个序列集和增量计数变量

时间:2017-06-04 14:00:09

标签: r loops dplyr sequence increment

我的数据如下(人-期格式的文件),其中 hldid 代表唯一标识符,variable 代表时间,paid 是我感兴趣的虚拟(0/1)变量。

   hldid variable paid
1      1        1    0
2      1        2    0
3      1        3    0
4      1        4    1
5      1        5    1
6      1        6    0
7      1        7    1
8      1        8    1
9      1        9    0
10     1       10    0
11     2        1    0
12     2        2    0
13     2        3    1
14     2        4    1
15     2        5    0
16     2        6    1
17     2        7    0
18     2        8    0
19     2        9    0
20     2       10    0

我想要达到的目的是:

  hldid variable  paid  last wwork2
 1     1        1     0     0      0
 2     1        2     0     0      0
 3     1        3     0     0      0
 4     1        4     1     0      0
 5     1        5     1     0      0
 6     1        6     0     0     -2
 7     1        7     1     0     -1
 8     1        8     1     1      0
 9     1        9     0     0      1
10     1       10     0     0      2
11     2        1     0     0      0
12     2        2     0     0      0
13     2        3     1     0      0
14     2        4     1     0     -2
15     2        5     0     0     -1
16     2        6     1     1      0
17     2        7     0     0      1
18     2        8     0     0      2
19     2        9     0     0      0
20     2       10     0     0      0

我想创建一个向量:(1)为每个 hldid 找到 paid 的最后一集(即最后一个 paid == 1 的时期),然后(2)对最后一集之前的 2 集递减计数(-2、-1),对最后一集之后的 2 集递增计数(1、2)。

到目前为止,这就是我所做的。

  1. 查找 paid 的最后一集

    这里复杂的是,付费不是一个连续的序列。例如hldid == 1在第6集停止支付,在第7集再次开始,最后一集在第8集。

    所以我的想法是将所有paid == 1分组,计算剧集的数量,然后将其合并。但是,我不确定这是最有效的策略。

    # Keep only the paid person-periods, then number them 0, 1, 2, ... within
    # each household. The counter restarts whenever hldid differs from the
    # previous row, so rows of the same hldid are assumed to be adjacent.
    ddw <- dta %>% filter(paid == 1)
    ddw$work <- 0

    # seq_len(...)[-1] is empty when ddw has fewer than two rows, whereas
    # 2:nrow(ddw) would count downwards (2, 1, ...) and index missing rows.
    for (i in seq_len(nrow(ddw))[-1]) {
      # paid is always 1 here because of the filter above, so only the
      # household boundary needs checking.
      if (ddw$hldid[i] == ddw$hldid[i - 1]) {
        ddw$work[i] <- ddw$work[i - 1] + 1
      }
    }

    # Full outer join back onto the original data; unpaid rows get work = NA.
    ddf <- merge(dta, ddw, by = c("hldid", "variable", "paid"), all = TRUE)
    

    然后,我找到最后一集

    # Within each household, record the period (`variable`) of the row holding
    # the highest paid-episode counter; every other row gets 0. Rows whose
    # `work` is NA yield NA here.
    ddw2 <- ddf %>%
      group_by(hldid) %>%
      mutate(end_work = ifelse(work == max(work, na.rm = TRUE), variable, 0))
    

    最后,我创建了一个虚拟变量,用来标记最后一个 paid 剧集所在的位置

    # Flag the final paid episode: NA values of end_work are replaced by 0,
    # and wwork is 1 exactly where end_work equals the row's own period.
    ddw2 <- ddw2 %>%
      group_by(hldid) %>%
      mutate(
        end_work = replace(end_work, is.na(end_work), 0),
        wwork = as.numeric(end_work == variable)
      )
    
    2. 递增/递减

       现在,从这里开始,我不知道如何在最后一集之前和之后进行递增/递减。到目前为止,我只能想出这个:

      df <- ddw2
      df$wwork2 <- 0

      # Mark the last paid episode and its immediate neighbours with 1.
      n_rows <- nrow(df)
      # seq_len(...)[-1] skips the first row and is empty for fewer than two
      # rows, unlike 2:nrow(df).
      for (i in seq_len(n_rows)[-1]) {
        if (df$hldid[i] == df$hldid[i - 1] && df$wwork[i] == 1) {
          df$wwork2[i - 1] <- 1
          df$wwork2[i] <- 1
          # Only touch the following row when it exists and belongs to the
          # same household; writing to row n_rows + 1 would fail.
          if (i < n_rows && df$hldid[i + 1] == df$hldid[i]) {
            df$wwork2[i + 1] <- 1
          }
        }
      }
      

      数据

      # Example person-period data: two households (hldid) observed over ten
      # periods (variable) each, with a 0/1 paid indicator.
      dta <- data.frame(
        hldid = rep(c(1, 2), each = 10),
        variable = rep(as.numeric(1:10), times = 2),
        paid = c(
          0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
          0, 0, 1, 1, 0, 1, 0, 0, 0, 0
        )
      )
      
      library(dplyr)
      

2 个答案:

答案 0 :(得分:2)

使用 dplyr,按 hldid 分组,然后将 end_work 定义为 variable 与 paid==1 时 variable 的最大值之差,再把绝对值大于 2 的值替换为 0……

library(dplyr)
# For each household: `last` flags the final paid period, and `end_work` is the
# signed distance (in periods) from it, kept only within two periods either
# side and set to 0 elsewhere.
dta2 <- dta %>%
  group_by(hldid) %>%
  mutate(
    # Period of the last paid == 1 row, computed once per household. It is NA
    # when the household has no paid row, rather than letting max() return
    # -Inf with a warning on an empty vector.
    .last_paid = if (any(paid == 1, na.rm = TRUE)) {
      max(variable[which(paid == 1)])
    } else {
      NA_real_
    },
    last = as.numeric(!is.na(.last_paid) & variable == .last_paid),
    end_work = variable - .last_paid,
    # Distant periods, and households with no paid row at all, report 0.
    end_work = replace(end_work, is.na(.last_paid) | abs(end_work) > 2, 0)
  ) %>%
  ungroup() %>%
  # Drop the helper column so the output keeps the original column set.
  select(-.last_paid)

dta2
   hldid variable  paid  last end_work
   <dbl>    <dbl> <dbl> <dbl>    <dbl>
 1     1        1     0     0        0
 2     1        2     0     0        0
 3     1        3     0     0        0
 4     1        4     1     0        0
 5     1        5     1     0        0
 6     1        6     0     0       -2
 7     1        7     1     0       -1
 8     1        8     1     1        0
 9     1        9     0     0        1
10     1       10     0     0        2
11     2        1     0     0        0
12     2        2     0     0        0
13     2        3     1     0        0
14     2        4     1     0       -2
15     2        5     0     0       -1
16     2        6     1     1        0
17     2        7     0     0        1
18     2        8     0     0        2
19     2        9     0     0        0
20     2       10     0     0        0

每个 id 的工作结束时期可以通过以下代码汇总:
# One row per household: the period of its last paid == 1 observation.
end_w <- dta %>%
  group_by(hldid) %>%
  summarise(end_episode = max(variable[paid == 1]))

end_w
  hldid end_episode
  <dbl>       <dbl>
1     1           8
2     2           6

答案 1 :(得分:1)

我们可以尝试data.table

library(data.table)
# Add `last` (1 on the final paid row of each household) and `wwork2` (signed
# row offset from that row, zeroed beyond two rows) to dta by reference.
setDT(dta)[, c("last", "wwork2") := {
  paid_rows <- which(paid == 1)
  if (length(paid_rows) > 0L) {
    offset <- seq_len(.N) - max(paid_rows)
    # Multiplying by the logical mask zeroes offsets more than two rows away.
    .(as.integer(offset == 0L), offset * (abs(offset) <= 2L))
  } else {
    # No paid row in this household: which.max(cumsum(paid)) would return 1
    # and wrongly flag the first row, so report zeros instead.
    .(rep(0L, .N), rep(0L, .N))
  }
}, by = hldid]

# setDT() converted dta in place, so the result is in dta itself.
dta
#     hldid variable paid last wwork2
# 1:     1        1    0    0      0
# 2:     1        2    0    0      0
# 3:     1        3    0    0      0
# 4:     1        4    1    0      0
# 5:     1        5    1    0      0
# 6:     1        6    0    0     -2
# 7:     1        7    1    0     -1
# 8:     1        8    1    1      0
# 9:     1        9    0    0      1
#10:     1       10    0    0      2
#11:     2        1    0    0      0
#12:     2        2    0    0      0
#13:     2        3    1    0      0
#14:     2        4    1    0     -2
#15:     2        5    0    0     -1
#16:     2        6    1    1      0
#17:     2        7    0    0      1
#18:     2        8    0    0      2
#19:     2        9    0    0      0
#20:     2       10    0    0      0