按小时和天计算员工人数

时间:2020-06-30 19:29:54

标签: python r excel dataframe data-wrangling

我有员工编号、星期几,以及他们每天的上班打卡时间(Clockin)和下班打卡时间(Clockout)。我想按小时统计在岗的员工人数。使用 Excel 公式的方案也可以。

我的样本数据如下:

Employee ID   Day      Clockin      Clockout
1             Mon        7:00          15:00
1             Fri        7:00          15:00
2             Wed        8:00          22:00
2             Mon        10:00         18:00
2             Fri        9:00          20:00
3             Mon        7:00          8:00 

我的输出应如下所示:

Timestamp   Mon   Tue   Wed  Thu   Fri 
7:00        2      0     0    0     1
8:00        1      0     1    0     1
9:00        1      0     1    0     2
10:00       2      0     1    0     2
11:00       2      0     1    0     2

我尝试过对 Clockin 和 Clockout 两列使用 mutate,但没有成功。我希望得到 R 中的解决方案。我也尝试了问题 “Calculate the days per month between checkin and checkout dates in R” 中提到的解决方案,但似乎不适用于我的情况。

输入示例

ID  Day Clockin Clockout
Employee 1  Mon 7:00    15:00
Employee 2  Mon 8:00    15:00
Employee 3  Mon 12:00   14:00
Employee 4  Mon 13:00   20:00
Employee 5  Mon 15:00   22:00
Employee 6  Mon 23:00   23:30

输出示例

Mon Count 
7:00    1
8:00    2
9:00    2
10:00   2
11:00   2
12:00   3
13:00   4
14:00   3
15:00   2
16:00   2
17:00   2
18:00   2
19:00   2
20:00   1
21:00   1
22:00   0
23:00   1

如果您留意计数就会发现,每当有人上班打卡或下班打卡时,计数都会随之变化。

3 个答案:

答案 0 :(得分:2)

这比看起来要复杂一些:如果员工上夜班并工作到第二天,那么午夜之后的时段需要计入第二天的在岗人数。

为此,我编写了一个函数,它接收星期几、上班打卡时间和下班打卡时间,返回一个时间序列(间隔 1 小时)以及序列中每个时间点对应的星期几。见下文:

#' Expand one shift into stepped time stamps with matching weekday labels
#'
#' Only base R is used; no packages are attached as a side effect.
#'
#' @param day.i Weekday abbreviation of the clock-in day, e.g. "MON" or "Mon"
#'   (matched case-insensitively). A length-one character or factor.
#' @param start.i Clock-in time in "%H:%M" format.
#' @param end.i Clock-out time in "%H:%M" format. If it is earlier than
#'   `start.i`, the shift is taken to end on the following calendar day.
#' @param step.i Step handed to `seq()` as `by`; defaults to one hour.
#' @return A list with two parallel elements: `DAY`, the weekday abbreviation
#'   ("Mon" ... "Sun") of each time stamp, and `HOUR`, the POSIXct time stamps
#'   from clock-in up to and including clock-out when it falls on a step.
time.seq <- function(day.i, start.i, end.i, step.i = '1 hour'){

  # Weekday labels, Monday first.
  day.labels <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

  day.idx <- match(toupper(as.character(day.i)), toupper(day.labels))
  if (length(day.idx) != 1L || is.na(day.idx)) {
    stop("`day.i` must be a single weekday abbreviation such as \"MON\".",
         call. = FALSE)
  }

  # Only a time of day is parsed, so both values land on the current date.
  start.i <- as.POSIXct(start.i, format = "%H:%M")
  end.i <- as.POSIXct(end.i, format = "%H:%M")
  if (length(start.i) != 1L || length(end.i) != 1L ||
      is.na(start.i) || is.na(end.i)) {
    stop("`start.i` and `end.i` must be single clock times in \"%H:%M\" format.",
         call. = FALSE)
  }

  if (start.i > end.i) {
    # Shift runs past midnight: move clock-out to the same clock time on the
    # next calendar day.
    end.i <- seq(end.i, by = "DSTday", length.out = 2L)[2L]
  }

  out.h <- seq(start.i, end.i, by = step.i)

  # Whole calendar days elapsed since the clock-in date. Working on Date
  # values avoids difftime's automatic unit selection, which reports "days"
  # instead of seconds when every gap between time stamps is a full day.
  out.date <- as.Date(format(out.h, "%Y-%m-%d"))
  day.shift <- as.integer(unclass(out.date) - unclass(out.date[1L]))

  # Wrap around the week so that Sunday + 1 day is Monday.
  out.d <- day.labels[(day.idx - 1L + day.shift) %% 7L + 1L]

  list(DAY = out.d, HOUR = out.h)
}

然后使用该函数,并且与其他答案中使用的逻辑类似,我们可以获得每小时的计数。

library(dplyr)
library(tidyr)

# Expand each shift into one row per time stamp returned by time.seq(),
# carrying along the weekday label that belongs to that time stamp.
shift_hours <- df %>%
  rowwise() %>%
  mutate(
    HOUR = list(time.seq(Day, Clockin, Clockout)$HOUR),
    DAY = list(time.seq(Day, Clockin, Clockout)$DAY)
  ) %>%
  unnest(c(HOUR, DAY))

# Tally rows per weekday and hour of day, then spread weekdays into columns.
shift_hours %>%
  count(Day = DAY, Hour = format(HOUR, "%H:%M"), name = "Count") %>%
  pivot_wider(names_from = Day, values_from = Count)

#> # A tibble: 22 x 8
#>    Hour    Fri   Mon   Sat   Sun   Thu   Tue   Wed
#>    <chr> <int> <int> <int> <int> <int> <int> <int>
#>  1 07:00     2    NA     1    NA     1     1     1
#>  2 08:00     2    NA     1    NA     1     1     1
#>  3 09:00     2    NA     1    NA     1     1     1
#>  4 10:00     4    NA     1    NA     3     3     3
#>  5 11:00     4    NA     1    NA     3     3     3
#>  6 12:00     5     1     1    NA     4     4     4
#>  7 13:00     5     1     1    NA     4     4     4
#>  8 14:00     5     1     1    NA     4     4     4
#>  9 15:00     5     1     1    NA     4     4     4
#> 10 16:00     3     1    NA    NA     3     3     3
#> # ... with 12 more rows

数据

# Sample timesheet in dput() form: 24 rows with columns Employee.ID (integer)
# and Day, Clockin, Clockout (factors; each also carries an unused "" level).
# Day uses upper-case abbreviations ("MON" ... "SUN"). Some rows have a
# clock-out that is earlier than the clock-in (e.g. 20:00 -> 4:15).
df <- structure(list(Employee.ID = c(462L, 462L, 559L, 559L, 559L, 
559L, 560L, 560L, 560L, 560L, 560L, 715L, 715L, 715L, 715L, 715L, 
791L, 791L, 791L, 791L, 802L, 802L, 802L, 802L), Day = structure(c(2L, 
4L, 7L, 8L, 6L, 2L, 3L, 7L, 8L, 6L, 2L, 3L, 7L, 2L, 4L, 5L, 7L, 
8L, 6L, 2L, 7L, 8L, 6L, 2L), .Label = c("", "FRI", "MON", "SAT", 
"SUN", "THU", "TUE", "WED"), class = "factor"), Clockin = structure(c(5L, 
5L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 
5L, 5L, 5L, 2L, 2L, 2L, 2L), .Label = c("", "10:00", "12:00", 
"20:00", "7:00"), class = "factor"), Clockout = structure(c(2L, 
2L, 4L, 4L, 5L, 4L, 7L, 8L, 7L, 7L, 6L, 10L, 9L, 11L, 9L, 9L, 
2L, 2L, 2L, 2L, 4L, 7L, 3L, 4L), .Label = c("", "15:00", "17:30", 
"18:00", "18:15", "19:45", "20:00", "22:00", "4:00", "4:15", 
"4:45"), class = "factor")), row.names = c(NA, 24L), class = "data.frame")

答案 1 :(得分:1)

如果我们只使用“Clockin”的小时,可以在 R 中对“Day”和“Clockin”做 count,然后使用 pivot_wider 转换为宽格式:

pivot_wider

更新

使用新数据

library(dplyr)
library(tidyr)
library(lubridate)
# Count clock-ins per weekday and clock-in time stamp, add a zero row for
# every hourly step between the earliest and latest clock-in that has no
# records, and spread weekdays into columns.
df1 %>%
  transmute(
    Day = factor(Day, levels = c("MON", "TUE", "WED", "THU", "FRI", "SAT")),
    Clockin = ymd_hms(Clockin)
  ) %>%
  count(Day, Clockin) %>%
  complete(
    Day,
    Clockin = seq(min(Clockin), max(Clockin), by = "1 hour"),
    fill = list(n = 0)
  ) %>%
  mutate(Clockin = format(Clockin, "%H:%M")) %>%
  pivot_wider(names_from = Day, values_from = n)
# A tibble: 4 x 7
#  Clockin   MON   TUE   WED   THU   FRI   SAT
#  <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 07:00       0     0     0     0     1     1
#2 08:00       0     0     0     0     0     0
#3 09:00       0     0     0     0     0     0
#4 10:00       0     1     1     1     1     0

数据

 # Expand every shift into hourly time stamps from clock-in to clock-out (the
 # clock-out hour itself is included when it falls on a whole step), then
 # count rows per day and hour of day.
 # map2() is a purrr function; it is namespace-qualified because this snippet
 # only attaches dplyr, tidyr and lubridate.
 df2 %>%
    transmute(Day,  Hour = purrr::map2(as.POSIXct(Clockin, format = '%H:%M'),
         as.POSIXct(Clockout, format = '%H:%M'), ~ seq(.x, .y, by = '1 hour'))) %>%
    unnest(c(Hour)) %>%
    count(Day, Hour = format(Hour, '%H:%M'))
# A tibble: 17 x 3
#   Day   Hour      n
#   <chr> <chr> <int>
# 1 Mon   07:00     1
# 2 Mon   08:00     2
# 3 Mon   09:00     2
# 4 Mon   10:00     2
# 5 Mon   11:00     2
# 6 Mon   12:00     3
# 7 Mon   13:00     4
# 8 Mon   14:00     4
# 9 Mon   15:00     4
#10 Mon   16:00     2
#11 Mon   17:00     2
#12 Mon   18:00     2
#13 Mon   19:00     2
#14 Mon   20:00     2
#15 Mon   21:00     1
#16 Mon   22:00     1
#17 Mon   23:00     1

答案 2 :(得分:1)

我想在这里分享我的解决方案,以防它对其他人有所帮助。该解决方案唯一的区别是,我额外添加了一列 Position 作为过滤条件。如果您的问题与上述问题类似,可以去掉该过滤条件;否则,它有助于按 hours 和 position id 进行排序。

  #loading libraries
    library(lubridate)
    library(readxl)
    library(stringr)
    library(tidyr)
    # Read the timesheet; the sixth spreadsheet column is skipped on import,
    # leaving six columns in `df`.
    df <- read_excel("data_sample.xlsx",
                     col_types = c("numeric", "text", "date", "guess", "guess",
                                   "skip", "numeric"))
    # Express clock times as fractional hours (e.g. 7:30 -> 7.5) so they can be
    # compared numerically against whole-hour slots.
    df$`Clock In` <- hour(df$`Clock In`) + minute(df$`Clock In`) / 60
    df$`Clock Out` <- hour(df$`Clock Out`) + minute(df$`Clock Out`) / 60
    # Drop zero-length records (clock-in equal to clock-out).
    df <- df[!(df$`Clock In` == df$`Clock Out`), ]
    # Labels of the 24 hourly slots: "00:00", "01:00", ..., "23:00".
    hours <- sprintf("%02d:00", 0:23)
    # Presence matrix: one row per record, one column per hourly slot. A cell
    # is 1 when clock-in <= slot < clock-out and 0 otherwise. Built in one
    # vectorised step instead of growing a matrix row by row.
    slot_start <- 0:23
    hours_mat <- (outer(df$`Clock In`, slot_start, "<=") &
                    outer(df$`Clock Out`, slot_start, ">")) * 1
    # Convert the matrix to a data frame labelled by slot.
    hours_df <- as.data.frame(hours_mat)
    colnames(hours_df) <- hours
    # Bind the presence columns to the original records.
    final_df <- cbind(df, hours_df)
    # Sum presence per date and position; the hour columns are selected by
    # name rather than by position.
    result <- aggregate(final_df[, hours, drop = FALSE],
                        by = list(Date = final_df$Date,
                                  Position = final_df$Position),
                        FUN = sum)
    # Turn dates into a factor before reshaping.
    result$Date <- factor(result$Date)
    # Wide to long: one row per (Date, Position, Timestamp).
    long_result <- gather(result, Timestamp, Count, "00:00":"23:00",
                          factor_key = TRUE)
    # Long to wide again, this time with one column per date.
    result_wide <- spread(long_result, Date, Count)
    # To keep a single position, uncomment this line and fill in the number:
    # result_wide <- result_wide[result_wide$Position == <position number>, ]
    # Write the final table to "output.csv".
    write.csv(result_wide, "output.csv")

示例数据如下:

# Sample of the spreadsheet data in dput() form: 18 records with columns
# `Employee Number`, Day, Date, `Clock In`, `Clock Out`, Department, Position.
# Clock times are POSIXct values (UTC) that all fall on one placeholder date;
# two records have identical clock-in and clock-out values.
# The clock-in column is named `Clock In` (with a space) so that it matches
# `Clock Out` and the column names the script above refers to.
structure(list(`Employee Number` = c(1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1), Day = c("THU", "FRI", "SAT", "SUN", 
"WED", "THU", "FRI", "SAT", "SUN", "WED", "THU", "THU", "FRI", 
"SAT", "SUN", "WED", "THU", "THU"), Date = structure(c(1577923200, 
1578009600, 1578096000, 1578182400, 1578441600, 1578528000, 1578614400, 
1578700800, 1578787200, 1579046400, 1579132800, 1579132800, 1579219200, 
1579305600, 1579392000, 1579651200, 1579737600, 1579737600), class = c("POSIXct", 
"POSIXt"), tzone = "UTC"), `Clock In` = structure(c(-2209021200, 
-2209021200, -2209021200, -2209021200, -2209021200, -2209023000, 
-2209021200, -2209021200, -2209021200, -2209021200, -2209075200, 
-2209021200, -2209021200, -2209021200, -2209021200, -2209021200, 
-2209075200, -2209021200), class = c("POSIXct", "POSIXt"), tzone = "UTC"), 
    `Clock Out` = structure(c(-2208992400, -2208992400, -2208992400, 
    -2208992400, -2208992400, -2208994200, -2208992400, -2208992400, 
    -2208992400, -2208992400, -2209075200, -2208999600, -2208992400, 
    -2208992400, -2208992400, -2208992400, -2209075200, -2208999600
    ), class = c("POSIXct", "POSIXt"), tzone = "UTC"), Department = c(20, 
    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 
    20, 20), Position = c(35, 35, 35, 35, 35, 35, 35, 35, 35, 
    35, 35, 35, 35, 35, 35, 35, 35, 35)), row.names = c(NA, -18L
), class = c("tbl_df", "tbl", "data.frame"))