在R中按部门查找员工流失

时间:2019-03-20 08:43:08

标签: r dplyr

我正在寻找每个部门每月的员工流失率。我的数据包括员工ID,雇用日期,部门,离职日期和HR_Status。

enter image description here

我想按部门计算每月的员工流失率(turnover)。

离职人数(Leavers)= 根据 TermDate 统计的当月离职人数

特定月份的流失率 = 当月离职人数 / AVG(上个月的在职行数, 当月的在职行数)

以前我问过这个问题,有人在这里回答过,但这不是按部门划分的。

Finding Month on Month Turnover

复制

# Sample data (dput output): 50 employee records with five columns --
# TerminationDate (Date; NA for the rows whose HrstatusName is "Regular"),
# HireDate (POSIXct, UTC), HrstatusName, EmployeeId and Division.
#
# dput() of a data.table also emits `.internal.selfref = <pointer: ...>`, which
# is not parseable R; it is omitted here so the snippet can be pasted and run.
# Call data.table::setDT(df) before using `:=` so data.table can rebuild it.
df <- structure(list(TerminationDate = structure(c(16921, 16921, 12814, 
13028, 15392, 15160, 15186, NA, 17135, 12788, 14491, NA, 15166, 
15126, 15113, 17060, 13283, 12916, NA, 17905, 15611, 17135, 13299, 
17183, 17256, 12761, 17256, 17256, 14421, 14526, 12892, 17214, 
14526, 14526, 15160, 12863, 12726, 14491, 13010, 16073, 16073, 
13955, 15125, 15317, NA, 15429, 15474, 12843, 15237, 12921), class = "Date"), 
    HireDate = structure(c(759283200, 759283200, 766281600, 773020800, 
    781056000, 781056000, 783216000, 786240000, 787708800, 792201600, 
    792547200, 790732800, 796694400, 802828800, 813715200, 764985600, 
    828316800, 846374400, 848188800, 848361600, 848793600, 850003200, 
    861580800, 867715200, 870134400, 873072000, 875664000, 875664000, 
    876182400, 876700800, 878342400, 878342400, 878515200, 879120000, 
    879724800, 881193600, 881539200, 883612800, 883612800, 883612800, 
    883612800, 883612800, 883612800, 883612800, 883612800, 888710400, 
    888710400, 890697600, 893030400, 893376000), class = c("POSIXct", 
    "POSIXt"), tzone = "UTC"), HrstatusName = c("Resigned", "Resigned", 
    "Resigned", "Resigned", "Resigned", "Resigned", "Resigned", 
    "Regular", "Resigned", "Resigned", "Resigned", "Regular", 
    "Gross Misconduct", "Resigned", "Resigned", "Deceased", "Resigned", 
    "Resigned", "Regular", "Terminated", "Resigned", "Resigned", 
    "Resigned", "Resigned", "Resigned", "Resigned", "Resigned", 
    "Resigned", "Resigned", "Resigned", "Resigned", "Resigned", 
    "Resigned", "Resigned", "Resigned", "Resigned", "Resigned", 
    "Resigned", "Resigned", "Terminated", "Terminated", "Terminated", 
    "Retired", "Resigned", "Regular", "Resigned", "Resigned", 
    "Resigned", "Resigned", "Resigned"), EmployeeId = c("39bab084", 
    "39bab084", "5664d681", "520d0890", "20d65e2d", "eb8a8d88", 
    "1d30178f", "6dec15c8", "f67d57de", "76fb57f8", "3b33f6f3", 
    "a0a2b4e5", "9aa8d595", "dc820f73", "acace7da", "740ee9ec", 
    "35c81bd8", "f075debf", "5602d50b", "0d2d3f55", "5de0aee6", 
    "30cb76f7", "a9af8af5", "a95d601b", "32cc220d", "c476b80a", 
    "90772765", "90772765", "9c79745a", "ec579cf3", "f152ac4b", 
    "00041e9e", "b261e06b", "0efff3b5", "44db7a6c", "63d42ba4", 
    "38fbc1fa", "9960e29c", "48d52953", "051d8858", "051d8858", 
    "f44b3a8d", "3f17e928", "250c1bac", "68c4baa7", "7c3e5ee1", 
    "e7af1cf7", "cb4236d2", "f85f925b", "432da957"), Division = c("a60c5c5c", 
    "a60c5c5c", "3cc0c23b", "7e23b2d7", "3cc0c23b", "3cc0c23b", 
    "3cc0c23b", "eae5d36f", "c3abc225", "3cc0c23b", "7e23b2d7", 
    "eae5d36f", "3cc0c23b", "7e23b2d7", "eae5d36f", "a60c5c5c", 
    "7e23b2d7", "3cc0c23b", "3cc0c23b", "3cc0c23b", "c3abc225", 
    "c3abc225", "c3abc225", "5d980f59", "c3abc225", "eae5d36f", 
    "c3abc225", "c3abc225", "3cc0c23b", "a60c5c5c", "c3abc225", 
    "eae5d36f", "7e23b2d7", "a60c5c5c", "3cc0c23b", "3cc0c23b", 
    "3cc0c23b", "7e23b2d7", "3cc0c23b", "7e23b2d7", "7e23b2d7", 
    "7e23b2d7", "7e23b2d7", "3cc0c23b", "eae5d36f", "c3abc225", 
    "3cc0c23b", "216743cf", "3cc0c23b", "3cc0c23b")), class = c("data.table", 
"data.frame"), row.names = c(NA, -50L))

尝试过代码

library(data.table)

# Company-wide monthly turnover (NOT split by Division):
#   Turnover = leavers in month / mean(presences in month, presences in
#   previous month)
#
# NOTE(review): this snippet expects a column named `TermDate`, whereas the
# sample data posted with it names that column `TerminationDate` -- rename the
# column (or adjust the references below) before running it on that data.

# Convert `df` to a data.table by reference and coerce both date columns to
# Date.
df_leavers <- setDT(df)[, `:=`(
  TermDate = as.Date(as.character(TermDate)),
  HireDate = as.Date(as.character(HireDate))
)]

# Take an independent copy first: the next step overwrites TermDate in place.
df_presences <- copy(df_leavers)

# Leavers per "YYYY-MM"; rows with a missing TermDate are excluded.
df_leavers <- df_leavers[, TermDate := format(TermDate, "%Y-%m")][
  !is.na(TermDate), .(Leavers = .N), by = TermDate
]

# Presences per "YYYY-MM": rows with a missing TermDate are given the latest
# TermDate found in the data, then every row is expanded into one entry per
# month from HireDate to TermDate and the entries are counted per month.
df_presences <- df_presences[, maxTerm := max(TermDate, na.rm = TRUE)][
  is.na(TermDate), TermDate := maxTerm][
    , .(YearMonth = format(seq(HireDate, TermDate, by = "month"), "%Y-%m")),
    by = seq_len(nrow(df))][
      , .(Presences = .N), by = YearMonth]

# Keep every month that has presences; months without leavers get NA for now.
df_final <- df_leavers[df_presences, on = .(TermDate = YearMonth)]

setnames(df_final, c("YearMonth", "Leavers", "Presences"))

# Fill the gaps, order chronologically, and average each month's presences
# with the previous row's (0 for the first row) to obtain the denominator.
df_final <- df_final[is.na(Leavers), Leavers := 0L][order(YearMonth), ][
  , previousMonth := shift(Presences)][
    is.na(previousMonth), previousMonth := 0L][
      , AvgPresences := (Presences + previousMonth) / 2][
        , Turnover := round(Leavers / AvgPresences, 2)][
          , "previousMonth" := NULL]

这段代码能给出正确的结果,只是结果没有按部门划分。

所需的输出

Date         Turnover  Division
2019-01      0.23      XYC
2019-01      0.02      ZYV

1 个答案:

答案 0 :(得分:1)

我不确定这是否是你要的。

我额外添加了几列,这样您可以看清每一步的计算。

library(dplyr)
library(zoo) 

# Monthly leaver counts per Division, plus a turnover figure that compares each
# month with the previous month that has data for the same Division.
df %>%
  # Rows without a termination date would otherwise be tallied as leavers in
  # an `NA` month, so drop them first.
  filter(!is.na(TerminationDate)) %>%
  # TerminationDate is already a Date, so as.yearmon() needs no format string.
  mutate(month = as.yearmon(TerminationDate)) %>%
  group_by(month, Division) %>%
  tally(name = "leavers") %>%
  group_by(Division) %>%
  mutate(
    # Previous *observed* month within the Division, not the previous calendar
    # month; the first month of every Division therefore yields NA.
    prevmonth = lag(leavers, order_by = month),
    sum = leavers + prevmonth,
    turnover = leavers / (sum / 2)
  ) %>%
  select(Division, month, leavers, sum, prevmonth, turnover) %>%
  arrange(Division, month)

#> # A tibble: 34 x 6
#> # Groups:   Division [7]
#>    Division month         leavers   sum prevmonth turnover
#>    <chr>    <S3: yearmon>   <int> <int>     <int>    <dbl>
#>  1 216743cf Mrz 2005            1    NA        NA   NA    
#>  2 3cc0c23b Nov 2004            1    NA        NA   NA    
#>  3 3cc0c23b Jan 2005            2     3         1    1.33 
#>  4 3cc0c23b Mrz 2005            1     3         2    0.667
#>  5 3cc0c23b Mai 2005            2     3         1    1.33 
#>  6 3cc0c23b Aug 2005            1     3         2    0.667
#>  7 3cc0c23b Jun 2009            1     2         1    1    
#>  8 3cc0c23b Jul 2011            4     5         1    1.6  
#>  9 3cc0c23b Sep 2011            1     5         4    0.4  
#> 10 3cc0c23b Dez 2011            1     2         1    1    
#> # ... with 24 more rows

例如,对于部门 3cc0c23b 在 2005 年 1 月的数据,我是这样计算的:

enter image description here

这似乎很适合您的解释

  

特定月份的流失率 = 当月离职人数 / AVG(上个月的在职行数, 当月的在职行数)

但是“上个月”这一部分需要更多信息。您指的是数据集中出现的上一个月份吗?因为有很多月份没有数据。我的解决方案使用的是数据集中上一个有数据的月份。因此,每个部门的第一个月,以及只有一个月数据的部门,其流失率都是 NA。

如果您指的是日历上的前一个月,并且该月不在数据集中就意味着该月没有人离职,请说明。

更新: 这很奇怪。.在dplyr的CRAN文档中

https://cran.r-project.org/web/packages/dplyr/dplyr.pdf

tally() 和 count() 可以通过参数 name = 为新列指定名称,这在我这里是有效的。

但是在这里: https://www.rdocumentation.org/packages/dplyr/versions/0.7.8/topics/tally

它说:

  

当前无法控制输出变量名称-如果需要更改默认值,则必须自己编写summarise()。

而它在您那里不起作用。这是版本问题吗?我使用的是 dplyr_0.8.0.1。

但是,让我们这样做:

# Same computation without relying on tally(name = ): monthly leaver counts per
# Division and a turnover figure relative to the previous month that has data
# for the same Division.
df %>%
  # Rows without a termination date would otherwise be counted as leavers in
  # an `NA` month, so drop them first.
  filter(!is.na(TerminationDate)) %>%
  # TerminationDate is already a Date, so as.yearmon() needs no format string.
  mutate(month = as.yearmon(TerminationDate)) %>%
  group_by(month, Division) %>%
  summarise(leavers = n()) %>%
  group_by(Division) %>%
  mutate(
    # Previous *observed* month within the Division, not the previous calendar
    # month; the first month of every Division therefore yields NA.
    prevmonth = lag(leavers, order_by = month),
    sum = leavers + prevmonth,
    turnover = leavers / (sum / 2)
  ) %>%
  select(Division, month, leavers, sum, prevmonth, turnover) %>%
  arrange(Division, month)