我有两个数据框(df),想把它们连接(join)起来。
第一个df包含员工的计划工时(Wrk_Schd),以及该计划开始生效的日期(生效日期)。
>df1 = schedule and effective date
ID Date Wrk_Schd
0001 8/16/2002 80.00
0001 2/27/2004 40.00
0001 2/1/2006 50.00
0001 7/1/2017 36.00
第二个df包含工资期日期和实际工时(Wrk_Hrs)。
>df2 = pay periods and actual hours
ID Date Wrk_Hrs
0001 9/9/2003 32.00
0001 10/8/2005 35.00
0001 10/21/2006 35.00
0001 12/21/2007 35.00
0001 9/9/2012 40.00
0001 10/9/2013 40.00
0001 12/9/2017 36.00
0001 12/21/2017 36.00
我该如何按ID和日期连接这两个df,使每条实际工时记录都匹配到当时适用的计划工时?
请注意,df1和df2中的日期并不完全相同。因此,我需要的方案应把每个工资期匹配到不晚于它的最近一个生效日期,也就是说,在该生效日期与工资期之间不存在另一个同样适用的生效日期。
期望的结果如下
>df3
ID Date Wrk_Hrs Wrk_Schd
0001 9/9/2003 32.00 80.00
0001 10/8/2005 35.00 40.00
0001 10/21/2006 35.00 50.00
0001 12/21/2007 35.00 50.00
0001 9/9/2012 40.00 50.00
0001 10/9/2013 40.00 50.00
0001 12/9/2017 36.00 36.00
0001 12/21/2017 36.00 36.00
答案 0 :(得分:4)
一种可能的解决方案是使用dplyr和sqldf。
# Sample data: the schedule table (df1) and the pay-period table (df2).
# Each table is read from inline text and its Date column is parsed from
# month/day/year text into Date class in the same step.
df1 <- transform(
  read.table(header = TRUE, stringsAsFactors = FALSE, text = "ID Date Wrk_Schd
0001 08/16/2003 80.00
0001 02/27/2004 40.00
0001 02/01/2006 50.00
0001 07/01/2017 36.00"),
  Date = as.Date(Date, format = "%m/%d/%Y")
)
df2 <- transform(
  read.table(header = TRUE, stringsAsFactors = FALSE, text = "ID Date Wrk_Hrs
0001 09/09/2003 32.00
0001 10/08/2005 35.00
0001 10/21/2006 35.00
0001 12/21/2007 35.00
0001 09/09/2012 40.00
0001 10/09/2013 40.00
0001 12/09/2017 36.00
0001 12/21/2017 36.00"),
  Date = as.Date(Date, format = "%m/%d/%Y")
)
library(dplyr)
library(sqldf)
# Use lead() to add End_Date: the day before the next schedule's effective
# date within the same ID. The latest schedule of each ID has no successor,
# so its End_Date is NA (open-ended).
# ungroup() drops the grouping so later operations on df1_Mod are not
# silently performed per ID.
df1_Mod <- df1 %>%
  arrange(ID, Date) %>%
  group_by(ID) %>%
  mutate(End_Date = lead(Date) - 1) %>%
  ungroup()
df1_Mod
# ID Date Wrk_Schd End_Date
#1 1 2003-08-16 80 2004-02-26
#2 1 2004-02-27 40 2006-01-31
#3 1 2006-02-01 50 2017-06-30
#4 1 2017-07-01 36 <NA>
# Join the data frames on ID, with the pay date falling between Date and
# End_Date (both inclusive). A NULL End_Date means the schedule is still
# in effect, so only the lower bound applies.
df3 <- sqldf("SELECT df2.ID, df2.Date, df2.Wrk_Hrs, df1_Mod.Wrk_Schd
FROM df2, df1_Mod
WHERE df2.ID = df1_Mod.ID AND
df2.Date >= df1_Mod.Date AND
(df1_Mod.End_Date IS NULL OR df2.Date <= df1_Mod.End_Date)")
df3
# ID Date Wrk_Hrs Wrk_Schd
#1 1 2003-09-09 32 80
#2 1 2005-10-08 35 40
#3 1 2006-10-21 35 50
#4 1 2007-12-21 35 50
#5 1 2012-09-09 40 50
#6 1 2013-10-09 40 50
#7 1 2017-12-09 36 36
#8 1 2017-12-21 36 36
答案 1 :(得分:1)
这段代码可以解决问题。为了让代码更清晰,我把df1$Date改名为df1$start.date。
library(lubridate)
## Example data: schedules with their start dates (df1) and pay periods
## with the hours actually worked (df2). Dates are entered as
## month/day/year text and then converted to Date class.
df1 <- data.frame(
  ID = rep(1, 4),
  start.date = c("8/16/2003", "2/27/2004", "2/1/2006", "7/1/2017"),
  Wrk_Schd = c(80, 40, 50, 36)
)
df1[["start.date"]] <- as.Date(df1[["start.date"]], format = "%m/%d/%Y")
df2 <- data.frame(
  ID = rep(1, 8),
  Date = c(
    "9/9/2003", "10/8/2005", "10/21/2006", "12/21/2007",
    "9/9/2012", "10/9/2013", "12/9/2017", "12/21/2017"
  ),
  Wrk_Hrs = c(32, 35, 35, 35, 40, 40, 36, 36)
)
df2[["Date"]] <- as.Date(df2[["Date"]], format = "%m/%d/%Y")
## Optional: append a shifted copy of every row under a second ID so the
## code below is exercised with more than one ID.
## These lines can be discarded without affecting the rest of the code.
df1 <- rbind(
  df1,
  with(df1, data.frame(
    ID = ID + 1,
    start.date = start.date + 1,
    Wrk_Schd = Wrk_Schd + 10
  ))
)
df2 <- rbind(
  df2,
  with(df2, data.frame(
    ID = ID + 1,
    Date = Date + 1,
    Wrk_Hrs = Wrk_Hrs + 1
  ))
)
## Sort by ID and start date, then close every period the day before the
## next row's start date. The last period of each ID has no successor
## within that ID, so it is ended today instead.
df1 <- df1[order(df1$ID, df1$start.date), ]
df1$end.date <- c(df1$start.date[-1] - 1, today())
## TRUE on the last row of each ID (the next row belongs to another ID)
df1$end.date[c(head(df1$ID, -1) != tail(df1$ID, -1), TRUE)] <- today()
## ID start.date Wrk_Schd end.date
##1 1 2003-08-16 80 2004-02-26
##2 1 2004-02-27 40 2006-01-31
##3 1 2006-02-01 50 2017-06-30
##4 1 2017-07-01 36 2018-01-27
##5 2 2003-08-17 90 2004-02-27
##6 2 2004-02-28 50 2006-02-01
##7 2 2006-02-02 60 2017-07-01
##8 2 2017-07-02 46 2018-01-27
## Assign Wrk_Schd to each of the jobs.
## Every pay period takes the schedule of the df1 period that contains it:
## same ID and start.date <= Date <= end.date. Both bounds are inclusive,
## so a pay period dated exactly on an effective date (or on the last day
## of a period) is still matched.
## Rows are visited by index rather than with apply(df2, 1, ...): apply()
## turns the data frame into a character matrix, which pads numeric IDs of
## different widths and makes the ID comparison fail.
## A pay period with no matching schedule gets NA, so the new column is
## always a plain numeric vector.
df2$Wrk_Schd <- vapply(seq_len(nrow(df2)), function(i) {
  hit <- which(df1$ID == df2$ID[i] &
    df1$start.date <= df2$Date[i] &
    df1$end.date >= df2$Date[i])
  if (length(hit) > 0) df1$Wrk_Schd[hit[1]] else NA_real_
}, numeric(1))
## ID Date Wrk_Hrs Wrk_Schd
##1 1 2003-09-09 32 80
##2 1 2005-10-08 35 40
##3 1 2006-10-21 35 50
##4 1 2007-12-21 35 50
##5 1 2012-09-09 40 50
##6 1 2013-10-09 40 50
##7 1 2017-12-09 36 36
##8 1 2017-12-21 36 36
##9 2 2003-09-10 33 90
##10 2 2005-10-09 36 50
##11 2 2006-10-22 36 60
##12 2 2007-12-22 36 60
##13 2 2012-09-10 41 60
##14 2 2013-10-10 41 60
##15 2 2017-12-10 37 46
##16 2 2017-12-22 37 46
我还建议另一种做法,可以帮您省去一些后续步骤:直接汇总落在每个生效期间内的实际工时。
## Total the hours actually worked inside each schedule period.
## For every row of df1, sum Wrk_Hrs over the df2 rows with the same ID
## whose Date lies in start.date..end.date (both bounds inclusive, so pay
## periods dated on a boundary day are counted). A period with no pay
## periods in it sums to 0.
## Rows are visited by index rather than with apply(df1, 1, ...), which
## would coerce each row to character and break the ID comparison when
## IDs have different widths.
df1$Total_Wrk_Hrs <- vapply(seq_len(nrow(df1)), function(i) {
  in_period <- df2$ID == df1$ID[i] &
    df2$Date >= df1$start.date[i] &
    df2$Date <= df1$end.date[i]
  sum(df2$Wrk_Hrs[which(in_period)])
}, numeric(1))
## ID start.date Wrk_Schd end.date Total_Wrk_Hrs
##1 1 2003-08-16 80 2004-02-26 32
##2 1 2004-02-27 40 2006-01-31 35
##3 1 2006-02-01 50 2017-06-30 150
##4 1 2017-07-01 36 2018-01-27 72
##5 2 2003-08-17 90 2004-02-27 33
##6 2 2004-02-28 50 2006-02-01 36
##7 2 2006-02-02 60 2017-07-01 154
##8 2 2017-07-02 46 2018-01-27 74
最后,我注意到你的日期列不是日期类型。应先把它们转换为Date类型,才能进行日期比较。对每个尚未转换的日期列运行下面的代码即可:
# Parse the month/day/year text in the Date column into Date class
df2[["Date"]] <- as.Date(df2[["Date"]], format = "%m/%d/%Y")
class(df2[["Date"]]) # confirm the column is now of class Date
#[1] "Date"
此外,提问时应始终提供可重现的代码。也就是说,别人只需复制并运行你的代码,就能重现问题。详情请参阅“How to make a great R reproducible example?”。这样做会(大大)提高你获得回复的机会。
答案 2 :(得分:0)
您需要新建一列,同时包含生效日期和工资期日期,并用它来合并两个df;合并之后,如果不再需要,可以把这一列删除。
# Match every pay period in df2 to the schedule in effect on that date by
# building a shared key column `d` and merging on it.
# NOTE: the ID column is not used when computing `s`, so this works for a
# single ID only.
df1$Date <- as.Date(df1$Date, format = "%m/%d/%Y")
df2$Date <- as.Date(df2$Date, format = "%m/%d/%Y")
# `s` is used as a row index into df1 below, which is only correct when
# df1 is in ascending date order
df1 <- df1[order(df1$Date), ]
# s[j] = number of effective dates on or before pay date j, i.e. the row
# of df1 in effect then; `<=` so a pay date that falls exactly on an
# effective date uses the schedule starting that day
s <- colSums(outer(df1$Date, df2$Date, `<=`))
if (any(s == 0)) {
  stop("some pay dates fall before the first effective date", call. = FALSE)
}
df3 <- df1[s, ]
# key made of "<effective date> <pay date>", identical in both tables
df3$d <- df2$d <- paste(df1$Date[s], df2$Date)
# drop df3's Date (the effective date) so the merge keys are ID and d
df <- merge(df2, df3[names(df3) != "Date"])
# sort by pay date and remove the helper key from the result
df[order(df$Date), names(df) != "d"]
ID Date Wrk_Hrs Wrk_Schd
1 1 2003-09-09 32 80
2 1 2005-10-08 35 40
3 1 2006-10-21 35 50
4 1 2007-12-21 35 50
5 1 2012-09-09 40 50
6 1 2013-10-09 40 50
7 1 2017-12-09 36 36
8 1 2017-12-21 36 36