根据特定逻辑计算条件累积值

时间:2020-06-10 18:15:07

标签: r dplyr tidyverse tidyr lubridate

我是R的新手,有一个名为final的数据框作为主要数据集,如下所示

# Sample employee 1 ("ajay"): six monthly records starting 2019-11-01.
ajay_emp_no <- 1
ajay_ramped <- c(0, 0, 0, 0, 1, 1)  # flag column; 1 on the last two months
ajay_loans <- c(1, 22, 17, 25, 21, 23)
name <- "ajay"
# One-column data frame holding the month-start dates.
dates_seq_ajay <- as.data.frame(
  (seq(as.Date("2019/11/1"), by = "month", length.out = 6))
)
# cbind() recycles the scalar name and employee number across all six rows;
# the columns then receive their final names.
data <- setNames(
  cbind(name, ajay_emp_no, dates_seq_ajay, ajay_ramped, ajay_loans),
  c("name", "emp_no", "date", "Flag", "loans")
)


# Sample employee 2 ("dv"): four monthly records starting 2019-11-01,
# with the flag never set.
dv_emp_no <- 2
dv_flag <- c(0, 0, 0, 0)
dv_loans <- c(2, 15, 42, 1)
name <- "dv"
# One-column data frame holding the month-start dates.
dates_seq_dv <- as.data.frame(
  (seq(as.Date("2019/11/1"), by = "month", length.out = 4))
)
# cbind() recycles the scalar name and employee number across all four rows;
# the columns then receive their final names.
data1 <- setNames(
  cbind(name, dv_emp_no, dates_seq_dv, dv_flag, dv_loans),
  c("name", "emp_no", "date", "Flag", "loans")
)



# Sample employee 3 ("prince"): five monthly records starting 2020-05-01.
prince_emp_no <- 3
prince_flag <- c(0, 0, 0, 1, 1)  # flag column; 1 on the last two months
prince_loans <- c(16, 31, 28, 32, 23)
name <- "prince"
# One-column data frame holding the month-start dates.
dates_seq_prince <- as.data.frame(
  (seq(as.Date("2020/5/1"), by = "month", length.out = 5))
)
# cbind() recycles the scalar name and employee number across all five rows;
# the columns then receive their final names.
data2 <- setNames(
  cbind(name, prince_emp_no, dates_seq_prince, prince_flag, prince_loans),
  c("name", "emp_no", "date", "Flag", "loans")
)

# Stack the three per-employee data frames row-wise into the main data set.
final <- do.call(rbind, list(data, data1, data2))

我的数据框中有1000名员工,我想为每位员工计算月数、绩效和累计绩效:如果某位员工出现了标志1,则只计算到第一次出现标志1的那一行为止,如下方的预期输出所示。

如果某位员工的标志始终为0(从未出现标志1),则对其全部现有记录计算月数、绩效和累计绩效。

对于每位员工

Month是该员工有记录的月份序号(即第几个月);

绩效是每月贷款/总贷款的比例

总贷款是截至第一次出现标志1的那一行(含该行)的所有贷款之和;如果标志始终为0,则总贷款是该员工所有贷款之和

累计绩效是员工各月绩效的逐行累加值,一直累加到第一次出现标志1的那一行为止

输出如下所示,仅适用于3名员工,但是我需要对所有1000名员工有一个共同的逻辑

enter image description here

1 个答案:

答案 0 :(得分:2)

我们按 `name` 分组,基于 `Flag` 的累加和(`cumsum`)创建临时列 `tmp`;用 `row_number()` 得到 `Month`;`Performance` 等于 `loans` 除以满足 `tmp < 2` 的那些行的 `loans` 之和;`CumulativePerformance` 是 `Performance` 的累加和。然后,以 `tmp` 为条件,把 `tmp > 1` 的行在这三列中的值替换为 `NA`,最后删除 `tmp` 列。

library(dplyr) # >= 1.0.0, required for across()

# Per-employee metrics, computed in the existing row order of each group:
#   Month                 - row counter within the employee
#   Performance           - loans divided by the employee's total loans up to
#                           and including the first row with Flag == 1 (all
#                           rows when the flag never appears)
#   CumulativePerformance - running sum of Performance
# Rows after the first flagged row are set to NA in these three columns.
# NOTE(review): assumes Flag is 0/1 and that rows are already in date order
# within each name - confirm for the full data set.
final %>%
    group_by(name) %>%
    mutate(
        # cummax() keeps the flag "on" once it has been seen, so tmp is 0
        # before the first flagged row, 1 on it, and >= 2 on every later row,
        # even if Flag drops back to 0 afterwards. A plain cumsum(Flag) would
        # leave such a later unflagged row inside the window.
        tmp = cumsum(cummax(Flag)),
        Month = row_number(),
        Performance = loans / sum(loans[tmp < 2]),
        CumulativePerformance = cumsum(Performance)
    ) %>%
    # Blank out the derived columns on every row after the first flagged one.
    mutate(across(Month:CumulativePerformance, ~ replace(., tmp > 1, NA))) %>%
    ungroup() %>%
    select(-tmp)
# A tibble: 15 x 8
#   name   emp_no date        Flag loans Month Performance CumulativePerformance
#   <chr>   <dbl> <date>     <dbl> <dbl> <int>       <dbl>                 <dbl>
# 1 ajay        1 2019-11-01     0     1     1      0.0116                0.0116
# 2 ajay        1 2019-12-01     0    22     2      0.256                 0.267 
# 3 ajay        1 2020-01-01     0    17     3      0.198                 0.465 
# 4 ajay        1 2020-02-01     0    25     4      0.291                 0.756 
# 5 ajay        1 2020-03-01     1    21     5      0.244                 1     
# 6 ajay        1 2020-04-01     1    23    NA     NA                    NA     
# 7 dv          2 2019-11-01     0     2     1      0.0333                0.0333
# 8 dv          2 2019-12-01     0    15     2      0.25                  0.283 
# 9 dv          2 2020-01-01     0    42     3      0.7                   0.983 
#10 dv          2 2020-02-01     0     1     4      0.0167                1     
#11 prince      3 2020-05-01     0    16     1      0.150                 0.150 
#12 prince      3 2020-06-01     0    31     2      0.290                 0.439 
#13 prince      3 2020-07-01     0    28     3      0.262                 0.701 
#14 prince      3 2020-08-01     1    32     4      0.299                 1.00  
#15 prince      3 2020-09-01     1    23    NA     NA                    NA     

如果使用的是dplyr的早期版本(低于1.0.0,没有`across`),请使用`mutate_at`代替`mutate(across(...))`:

# Variant for dplyr versions without across(): same per-employee metrics,
# computed in the existing row order of each group:
#   Month                 - row counter within the employee
#   Performance           - loans divided by the employee's total loans up to
#                           and including the first row with Flag == 1 (all
#                           rows when the flag never appears)
#   CumulativePerformance - running sum of Performance
# Rows after the first flagged row are set to NA in these three columns.
# NOTE(review): assumes Flag is 0/1 and that rows are already in date order
# within each name - confirm for the full data set.
final %>%
        group_by(name) %>%
        mutate(
            # cummax() keeps the flag "on" once it has been seen, so tmp is 0
            # before the first flagged row, 1 on it, and >= 2 on every later
            # row, even if Flag drops back to 0 afterwards. A plain
            # cumsum(Flag) would leave such a later unflagged row inside the
            # window.
            tmp = cumsum(cummax(Flag)),
            Month = row_number(),
            Performance = loans / sum(loans[tmp < 2]),
            CumulativePerformance = cumsum(Performance)
        ) %>%
        # Blank out the derived columns on every row after the first flagged
        # one.
        mutate_at(vars(Month:CumulativePerformance), ~ replace(., tmp > 1, NA)) %>%
        ungroup() %>%
        select(-tmp)