我有一个包含两个变量的数据集:某个人的日期和服务年限(仅用于构造一个小的可复现示例)。我需要得到此人开始工作的月份(在本例中为 1989-06);同时要考虑到,若将该方案应用于多个人,开始工作的月份可能因人而异。期望的结果如下:
library(data.table)
# Reproducible single-person example: 36 monthly observations (2009-01 through
# 2011-12), the running years-of-service counter, and the expected answer
# column (INITIAL_MONTH) that the solutions below try to reproduce.
dt <- structure(
  list(
    DATE = format(
      seq(as.Date("2009-01-01"), by = "month", length.out = 36),
      "%Y-%m"
    ),
    # The counter ticks over every June: 5 + 12 + 12 + 7 = 36 rows.
    Years_service = rep(c(19, 20, 21, 22), times = c(5, 12, 12, 7)),
    INITIAL_MONTH = rep("1989-06", 36)
  ),
  .Names = c("DATE", "Years_service", "INITIAL_MONTH"),
  class = c("data.table", "data.frame"),
  row.names = c(NA, -36L)
)
head(dt)
DATE Years_service INITIAL_MONTH
1: 2009-01 19 1989-06
2: 2009-02 19 1989-06
3: 2009-03 19 1989-06
4: 2009-04 19 1989-06
5: 2009-05 19 1989-06
6: 2009-06 20 1989-06
如何在R中获得它?
答案 0 :(得分:1)
我们可以在 `Years_service` 列中找到第一次发生变化的位置,然后用该索引处对应的 `DATE` 值减去该行的服务年限。
library(dplyr)
library(lubridate)
# Locate the first row where Years_service differs from the previous row
# (which.max() returns the position of the first TRUE), then step back that
# row's number of service years from its month to get the starting month.
dt %>%
  mutate(
    change_row = which.max(diff(Years_service) != 0) + 1,
    init_month = format(
      as.Date(paste0(DATE[change_row], "-01")) -
        years(Years_service[change_row]),
      "%Y-%m"
    )
  ) %>%
  # The helper index is only needed to compute init_month.
  select(-change_row)
# DATE Years_service INITIAL_MONTH init_month
#1 2009-01 19 1989-06 1989-06
#2 2009-02 19 1989-06 1989-06
#3 2009-03 19 1989-06 1989-06
#4 2009-04 19 1989-06 1989-06
#....
如果需要对多个人执行此操作,可以在其中添加 `group_by` 子句:
# Per-person variant: the first change in Years_service is located separately
# within each group, so every individual gets their own starting month.
# NOTE(review): `person` is presumably a placeholder for whatever column
# identifies an individual in the real data - confirm the actual column name.
dt %>%
  group_by(person) %>%
  mutate(
    inds = which.max(diff(Years_service) != 0) + 1,
    init_month = format(
      as.Date(paste0(DATE[inds], "-01")) - years(Years_service[inds]),
      "%Y-%m"
    )
  ) %>%
  # Drop the grouping so later verbs do not silently operate per person.
  ungroup() %>%
  select(-inds)
编辑
针对更新后的情况,我们可能需要先用 `arrange` 按日期排序:
# Simulate input that is not in chronological order (newest month first).
dt1 <- dt[order(-DATE)]
dt1 %>%
  # Parse the "YYYY-MM" strings into real dates so the rows can be sorted.
  mutate(dates = as.Date(paste0(DATE, "-01"))) %>%
  arrange(dates) %>%
  # With the rows back in time order, find the first change in Years_service
  # and subtract that row's service years from its date.
  mutate(
    change_row = which.max(diff(Years_service) != 0) + 1,
    init_month = format(
      dates[change_row] - years(Years_service[change_row]),
      "%Y-%m"
    )
  ) %>%
  select(-change_row)
答案 1 :(得分:1)
Base R解决方案
使用 `seq` 来向后推算月数:
先用 `sprintf` 函数创建一个带有日(`%d`)的新 `Date` 向量,以便供 `as.Date` 使用:
dt$Date <- sprintf("%s-01",dt$DATE)
再创建一个 `-X months` 格式的字符串向量,用于在 `seq` 中向后计数(注意:`Years_service` 的单位是年,若要按服务年限回退,应改用 `-X years` 格式):
dt$Back_step <- sprintf("-%s months",dt$Years_service)
使用 `for` 循环遍历每一行,得到 X 个月前的日期:
for(i in 1:nrow(dt)){
dt$INITIAL_MONTH[i] <- as.character(seq(as.Date(dt$Date[i],format="%Y-%m-%d"),
length = 2, by = dt$Back_step[i])[2])
}
`[2]` 表示我们取的是序列中的第二个值。
答案 2 :(得分:0)
还添加一个data.table解决方案。
# Sort chronologically and compute the row-to-row change in the service
# counter (shift() yields NA for the first row, which the filter below drops).
dt1 <- dt[order(DATE)]
dt1[, diff := Years_service - shift(Years_service)]
# Keep only the first row on which the counter went up by exactly one year.
dt2 <- head(dt1[diff == 1], 1)
# Start year = calendar year of that row minus the years served; the month
# part of DATE is carried over unchanged.
dt2[, init_month := paste0(
  as.numeric(substr(DATE, 1, 4)) - Years_service,
  "-",
  substr(DATE, 6, 7)
)]
# Copy the single computed value onto every row of the original table.
init_mon <- dt2$init_month[1]
dt <- dt[, init_month := init_mon]
如果数据中有多个人:
library(data.table)
# Reproducible two-person example: each person has the same 36 monthly
# observations (2009-01 through 2011-12), giving 72 rows in total.
dt <- structure(
  list(
    PERSON = rep(c(1, 2), each = 36),
    DATE = rep(
      format(
        seq(as.Date("2009-01-01"), by = "month", length.out = 36),
        "%Y-%m"
      ),
      times = 2
    ),
    # Per person the counter ticks over every June: 5 + 12 + 12 + 7 = 36 rows.
    Years_service = rep(
      rep(c(19, 20, 21, 22), times = c(5, 12, 12, 7)),
      times = 2
    ),
    INITIAL_MONTH = rep("1989-06", 72)
  ),
  .Names = c("PERSON", "DATE", "Years_service", "INITIAL_MONTH"),
  class = c("data.table", "data.frame"),
  # The compact row-names count must equal the column length (72, not 36);
  # otherwise data.frame methods such as nrow() report the wrong row count.
  row.names = c(NA, -72L)
)
head(dt)
# PERSON DATE Years_service INITIAL_MONTH
# 1 2009-01 19 1989-06
# 1 2009-02 19 1989-06
# 1 2009-03 19 1989-06
# 1 2009-04 19 1989-06
# 1 2009-05 19 1989-06
# 1 2009-06 20 1989-06
在计算中添加分组依据
# Sort each person's rows chronologically and compute the row-to-row change
# in the service counter within each person (shift() restarts per group, so
# the first row of every person gets NA and is dropped by the filter below).
dt1 <- dt[order(PERSON, DATE)]
dt1[, diff := Years_service - shift(Years_service), by = "PERSON"]
# For every person keep the first row on which the counter rose by one year.
dt2 <- dt1[diff == 1, head(.SD, 1), by = "PERSON"]
# Start year = calendar year of that row minus the years served; the month
# part of DATE is carried over unchanged.
dt2[, init_month := paste0(
  as.numeric(substr(DATE, 1, 4)) - Years_service,
  "-",
  substr(DATE, 6, 7)
)]
# Attach each person's result to all of their rows. merge() takes the join
# column through `by` (it has no `on` argument); all.x = TRUE keeps people
# for whom no change in Years_service was found (init_month is NA for them).
dt <- merge(dt, dt2[, list(PERSON, init_month)], by = "PERSON", all.x = TRUE)