按唯一ID和条件日期连接(join)两个数据框

时间:2018-01-26 19:08:36

标签: r

我有两个数据框(df),想把它们连接(join)起来

第一个是带有员工工作时间表的df,以及日程安排有效的相应日期(生效日期)

>df1 = schedule and effective date
ID      Date        Wrk_Schd
0001    8/16/2002   80.00
0001    2/27/2004   40.00
0001    2/1/2006    50.00
0001    7/1/2017    36.00

第二个是带有工资期和实际工时的df

>df2 = pay periods and actual hours
ID      Date        Wrk_Hrs
0001    9/9/2003    32.00
0001    10/8/2005   35.00   
0001    10/21/2006  35.00
0001    12/21/2007  35.00   
0001    9/9/2012    40.00
0001    10/9/2013   40.00
0001    12/9/2017   36.00
0001    12/21/2017  36.00

我如何按ID和日期进行连接,使实际工时的df中每一行都匹配到当时适用的(按生效日期确定的)工作时间表?

请注意,df1和df2中的日期并不完全相同。因此,我需要的解决方案应当在工资期晚于某个生效日期时进行连接,前提是两者之间不存在另一个同样适用的生效日期(即取工资期之前最近的一个生效日期)。

期望的结果如下

>df3 
ID      Date        Wrk_Hrs Wrk_Schd
0001    9/9/2003    32.00   80.00
0001    10/8/2005   35.00   40.00
0001    10/21/2006  35.00   50.00
0001    12/21/2007  35.00   50.00
0001    9/9/2012    40.00   50.00
0001    10/9/2013   40.00   50.00
0001    12/9/2017   36.00   36.00
0001    12/21/2017  36.00   36.00

3 个答案:

答案 0 :(得分:4)

一种可能的解决方案是使用dplyr和sqldf

# Sample inputs, parsed from whitespace-separated text.
# df1: one row per schedule change (ID, effective Date, scheduled hours).
df1 <- read.table(header = TRUE, stringsAsFactors = FALSE,
                  text = "ID      Date        Wrk_Schd
0001    08/16/2003   80.00
0001    02/27/2004   40.00
0001    02/01/2006    50.00
0001    07/01/2017    36.00")

# Parse the month/day/year strings into Date objects
df1[["Date"]] <- as.Date(df1[["Date"]], format = "%m/%d/%Y")

# df2: one row per pay period (ID, pay Date, hours actually worked).
df2 <- read.table(header = TRUE, stringsAsFactors = FALSE,
                  text = "ID      Date        Wrk_Hrs
0001    09/09/2003    32.00
0001    10/08/2005   35.00   
0001    10/21/2006  35.00
0001    12/21/2007  35.00   
0001    09/09/2012    40.00
0001    10/09/2013   40.00
0001    12/09/2017   36.00
0001    12/21/2017  36.00")

# Parse the month/day/year strings into Date objects
df2[["Date"]] <- as.Date(df2[["Date"]], format = "%m/%d/%Y")


library(dplyr)
library(sqldf)
# For every schedule row, compute the last day it stays in force: the day
# before the same ID's next effective date. lead() yields NA for the final
# row of each ID, which leaves the most recent schedule open-ended.
df1_Mod <- df1 %>%
  arrange(ID, Date) %>%
  group_by(ID) %>%
  mutate(End_Date = lead(Date) - 1) %>%
  ungroup()  # drop the grouping so later steps receive an ungrouped table

df1_Mod
#  ID       Date Wrk_Schd   End_Date
#1  1 2003-08-16       80 2004-02-26
#2  1 2004-02-27       40 2006-01-31
#3  1 2006-02-01       50 2017-06-30
#4  1 2017-07-01       36       <NA>

# Join the pay periods to the schedules: rows must share the same ID, and the
# pay date must fall inside the schedule's [Date, End_Date] range (both bounds
# inclusive). Schedule rows whose End_Date is NULL on the SQL side are treated
# as open-ended, so they match every pay date on or after their Date.

df3 <- sqldf("SELECT df2.ID, df2.Date, df2.Wrk_Hrs, df1_Mod.Wrk_Schd 
              FROM df2, df1_Mod
              WHERE df2.ID = df1_Mod.ID AND
              df2.Date >= df1_Mod.Date AND
             (df1_Mod.End_Date IS NULL OR df2.Date <= df1_Mod.End_Date)")

# Print the joined result; the output recorded for the sample data follows.
df3
#  ID       Date Wrk_Hrs Wrk_Schd
#1  1 2003-09-09      32       80
#2  1 2005-10-08      35       40
#3  1 2006-10-21      35       50
#4  1 2007-12-21      35       50
#5  1 2012-09-09      40       50
#6  1 2013-10-09      40       50
#7  1 2017-12-09      36       36
#8  1 2017-12-21      36       36

答案 1 :(得分:1)

这段代码可以解决问题。我将df1$Date更名为df1$start.date,使代码更清晰。

library(lubridate)
# Schedule table: one row per effective (start) date, dates parsed on creation.
df1 <- data.frame(
  ID = rep(1, 4),
  start.date = as.Date(
    c("8/16/2003", "2/27/2004", "2/1/2006", "7/1/2017"),
    format = "%m/%d/%Y"
  ),
  Wrk_Schd = c(80, 40, 50, 36)
)

# Pay-period table: one row per pay date with the hours actually worked.
df2 <- data.frame(
  ID = rep(1, 8),
  Date = as.Date(
    c("9/9/2003", "10/8/2005", "10/21/2006", "12/21/2007",
      "9/9/2012", "10/9/2013", "12/9/2017", "12/21/2017"),
    format = "%m/%d/%Y"
  ),
  Wrk_Hrs = c(32, 35, 35, 35, 40, 40, 36, 36)
)

## Optional: duplicate the sample rows under a second ID (ID + 1, dates
## shifted by one day, hours bumped) so the code is exercised with more
## than one employee. These lines can be discarded without affecting the
## rest of the code.
df1 <- rbind(
  df1,
  with(df1, data.frame(ID = ID + 1,
                       start.date = start.date + 1,
                       Wrk_Schd = Wrk_Schd + 10))
)
df2 <- rbind(
  df2,
  with(df2, data.frame(ID = ID + 1,
                       Date = Date + 1,
                       Wrk_Hrs = Wrk_Hrs + 1))
)

## Sort by employee and effective date, then derive the end of each period
df1 <- df1[with(df1, order(ID, start.date)), ]
## A period ends the day before the next row starts; the last row of each ID
## (where the following row belongs to a different ID) ends today instead.
df1$end.date <- replace(
  c(df1$start.date[-1] - 1, today()),
  df1$ID != c(df1$ID[-1], df1$ID[1]),
  today()
)

##  ID start.date Wrk_Schd   end.date
##1  1 2003-08-16       80 2004-02-26
##2  1 2004-02-27       40 2006-01-31
##3  1 2006-02-01       50 2017-06-30
##4  1 2017-07-01       36 2018-01-27
##5  2 2003-08-17       90 2004-02-27
##6  2 2004-02-28       50 2006-02-01
##7  2 2006-02-02       60 2017-07-01
##8  2 2017-07-02       46 2018-01-27

## Assign Wrk_Schd to each pay period: find the schedule row with the same ID
## whose [start.date, end.date] range contains the pay date.
## Rows are visited by index rather than with apply(df2, 1, ...), because
## apply() first converts the data frame to a character matrix, so IDs and
## dates would be compared as formatted (possibly space-padded) text.
## Bounds are inclusive so a pay date that falls exactly on an effective date
## still picks up that schedule; pay periods with no matching schedule get NA
## instead of silently turning the column into a list.
df2$Wrk_Schd <- vapply(seq_len(nrow(df2)), function(i) {
  hit <- which((df1$ID == df2$ID[i]) &
               (df1$start.date <= df2$Date[i]) &
               (df1$end.date >= df2$Date[i]))
  if (length(hit) == 0L) NA_real_ else df1$Wrk_Schd[hit[1L]]
}, numeric(1))

##   ID       Date Wrk_Hrs Wrk_Schd
##1   1 2003-09-09      32       80
##2   1 2005-10-08      35       40
##3   1 2006-10-21      35       50
##4   1 2007-12-21      35       50
##5   1 2012-09-09      40       50
##6   1 2013-10-09      40       50
##7   1 2017-12-09      36       36
##8   1 2017-12-21      36       36
##9   2 2003-09-10      33       90
##10  2 2005-10-09      36       50
##11  2 2006-10-22      36       60
##12  2 2007-12-22      36       60
##13  2 2012-09-10      41       60
##14  2 2013-10-10      41       60
##15  2 2017-12-10      37       46
##16  2 2017-12-22      37       46

我还想建议另一种做法,可以为您省去更多步骤:直接汇总属于每个时期的工时。

## Total hours worked within each schedule period: for every row of df1, sum
## the Wrk_Hrs of the df2 rows that share its ID and whose pay date lies in
## [start.date, end.date] (inclusive). Periods without pay dates sum to 0.
## Rows are visited by index rather than with apply(df1, 1, ...), because
## apply() converts the data frame to a character matrix and would compare
## IDs and dates as formatted text.
df1$Total_Wrk_Hrs <- vapply(seq_len(nrow(df1)), function(i) {
  in_period <- which((df2$ID == df1$ID[i]) &
                     (df2$Date >= df1$start.date[i]) &
                     (df2$Date <= df1$end.date[i]))
  sum(df2$Wrk_Hrs[in_period])
}, numeric(1))

##  ID start.date Wrk_Schd   end.date Total_Wrk_Hrs
##1  1 2003-08-16       80 2004-02-26         32
##2  1 2004-02-27       40 2006-01-31         35
##3  1 2006-02-01       50 2017-06-30        150
##4  1 2017-07-01       36 2018-01-27         72
##5  2 2003-08-17       90 2004-02-27         33
##6  2 2004-02-28       50 2006-02-01         36
##7  2 2006-02-02       60 2017-07-01        154
##8  2 2017-07-02       46 2018-01-27         74 

最后,我注意到您的日期列并不是日期类型。必须先把它们转换为日期类型,才能对日期进行比较。对每个尚未转换的日期列运行下面的代码即可:

# Parse the month/day/year strings into Date objects so they can be compared
df2[["Date"]] <- as.Date(df2[["Date"]], format = "%m/%d/%Y")
class(df2[["Date"]])  # confirm the column now has class Date
#[1] "Date"

此外,您应始终提供可重现的代码。也就是说,我只需复制您的代码并运行,就应该能够重现问题。有关详细信息,请参阅“How to make a great R reproducible example?”。这样做会(大大)增加您获得回复的机会。

答案 2 :(得分:0)

您必须创建一个同时包含生效日期和工资期日期的新列,并用它来合并两个df;合并之后,如果需要,可以再删除该列。

 # Convert both date columns from month/day/year text to Date
 df1$Date <- as.Date(df1$Date, format = "%m/%d/%Y")
 df2$Date <- as.Date(df2$Date, format = "%m/%d/%Y")
 # For each pay date, count the effective dates on or before it (inclusive, so
 # a pay date equal to an effective date uses that schedule). The count is
 # then used as a row index into df1.
 # NOTE(review): this relies on df1 being sorted by Date and covers a single
 # ID only (IDs are not compared here) - confirm before reusing on other data.
 s <- colSums(outer(df1$Date, df2$Date, `<=`))
 df3 <- df1[s, ]
 # Shared key column "<effective date> <pay date>" used to merge the two tables
 df3$d <- df2$d <- paste(df1$Date[s], df2$Date)
 # df3[-2] drops the schedule's own Date so only the pay date is kept
 df <- merge(df2, df3[-2])
 # Order by pay date and drop the helper key column (second column after merge)
 df[order(df$Date), -2]
 #  ID       Date Wrk_Hrs Wrk_Schd
 # 1  1 2003-09-09      32       80
 # 2  1 2005-10-08      35       40
 # 3  1 2006-10-21      35       50
 # 4  1 2007-12-21      35       50
 # 5  1 2012-09-09      40       50
 # 6  1 2013-10-09      40       50
 # 7  1 2017-12-09      36       36
 # 8  1 2017-12-21      36       36