我有一个纵向数据集,该数据集每月记录一个人的就业状况,持续45个月。我希望能够创建两个变量以添加到该数据集中: 1)每个人“失业”的总时长 2)失业期数
理想情况下,它也会跳过NA,而不会打断咒语
我创建了一个示例数据集来使事情变得简单:
ID <- c(1:10, 1:10, 1:10)
date <- c("2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01",
"2006-09-01", "2006-09-01", "2006-09-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01",
"2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-11-01",
"2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01",
"2006-11-01", "2006-11-01")
act <- c("Unemployed", "Employment", "Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Unemployed", "Education", "Unemployed", "Unemployed", "Unemployed",
"Education", "Education", "Employment", "Education", "Education", "NA", "Unemployed",
"Unemployed", "NA", "Unemployed", "Education", "Employment", "Employment", "NA", "Education",
"Unemployed")
df <- data.frame(ID, date, act)
df[order(ID),]
ID date act
1 1 2006-09-01 Unemployed
11 1 2006-10-01 Education
21 1 2006-11-01 Unemployed
2 2 2006-09-01 Employment
12 2 2006-10-01 Unemployed
22 2 2006-11-01 Unemployed
3 3 2006-09-01 Education
13 3 2006-10-01 Unemployed
23 3 2006-11-01 NA
4 4 2006-09-01 Education
14 4 2006-10-01 Unemployed
24 4 2006-11-01 Unemployed
5 5 2006-09-01 Education
15 5 2006-10-01 Education
25 5 2006-11-01 Education
6 6 2006-09-01 Education
16 6 2006-10-01 Education
26 6 2006-11-01 Employment
7 7 2006-09-01 Education
17 7 2006-10-01 Employment
27 7 2006-11-01 Employment
8 8 2006-09-01 Education
18 8 2006-10-01 Education
28 8 2006-11-01 NA
9 9 2006-09-01 Education
19 9 2006-10-01 Education
29 9 2006-11-01 Education
10 10 2006-09-01 Unemployed
20 10 2006-10-01 NA
30 10 2006-11-01 Unemployed
我尝试了罗兰(Roland)在Calculate duration in R上提出的解决方案,但是我不确定如何修改它以通过ID给我结果并处理NA。
library(data.table)
setDT(df)
df[, date := as.POSIXct(date, format = "%Y-%m-%d", tz = "GMT")]
glimpse(df)
df$act <- ifelse(df$act == "Unemployed",1,-1)
df[, run := cumsum(c(1, diff(act) != 0))]
df1 <- df[, list(act = unique(act),
duration = difftime(max(date), min(date), unit = "weeks")),
by = run]
df1
run act duration
1: 1 1 0 weeks
2: 2 -1 0 weeks
3: 3 1 0 weeks
4: 4 -1 0 weeks
5: 5 1 0 weeks
6: 6 -1 0 weeks
7: 7 1 0 weeks
8: 8 -1 0 weeks
9: 9 1 0 weeks
10: 10 -1 0 weeks
11: 11 1 0 weeks
我追求的是实现这一目标(持续时间是几个月,但可能是几周或几天):
ID spell_count duration
1 1 2 2
2 2 1 2
3 3 1 1
...
10 10 1 2
任何链接/文献/示例,对此的任何帮助将不胜感激。
谢谢。
答案 0 :(得分:1)
我只使用您的第一个代码块,然后在整个持续时间内,我这样做:
library(data.table)
setDT(df)
df_duration = df[act=="Unemployed",.(duration = .N),by = ID]
失业法的数量有点棘手:
df_spell_count = df[order(ID,date)]
df_spell_count <- df_spell_count[!(is.na(act)|act=="NA")]
df_spell_count[,previous_act := shift(act,1),by = ID]
df_spell_count<-df_spell_count[act =="Unemployed" & (previous_act!="Unemployed" | is.na(previous_act))]
df_spell_count<-df_spell_count[,.(spell_count =.N),by = ID]
如果您想将两者合并,只需:
df_stats <- merge(df_duration,df_spell_count, by = "ID", all.x = TRUE,all.y = TRUE)
请注意,此df不包含那些没有失业期的用户的行。
答案 1 :(得分:0)
使用软件包tidyverse
,您可以按一个变量(或多个变量)分组并非常容易地进行汇总。
在汇总数据之前,我将把列date
强制转换为Date
类,并用实际的缺失值"NA"
替换字符串NA
。
library(tidyverse)
is.na(df$act) <- df$act == "NA"
df$date <- as.Date(df$date)
df %>%
group_by(ID, act) %>%
summarise(spell_count = sum(act == "Unemployed", na.rm = TRUE),
duration = difftime(last(date), first(date), units = "weeks")) %>%
filter(act == "Unemployed") %>%
select(-act)
## A tibble: 5 x 3
## Groups: ID [5]
# ID spell_count duration
# <int> <int> <time>
#1 1 2 8.714286 weeks
#2 2 2 4.428571 weeks
#3 3 1 0.000000 weeks
#4 4 2 4.428571 weeks
#5 10 2 8.714286 weeks
上面的代码将仅给出至少有一个act == "Unemployed"
的行。
如果您希望所有行都使用以下基本R解决方案。
res <- lapply(split(df, df$ID), function(DF){
i <- DF$act == "Unemployed"
if(any(i, na.rm = TRUE))
duration <- difftime(max(DF$date[i], na.rm = TRUE), min(DF$date[i], na.rm = TRUE), units = "weeks")
else
duration <- 0
spell_count <- sum(i, na.rm = TRUE)
data.frame(ID = DF$ID[1], spell_count, duration)
})
res <- do.call(rbind, res)
row.names(res) <- NULL
res
# ID spell_count duration
#1 1 2 8.714286 weeks
#2 2 2 4.428571 weeks
#3 3 1 0.000000 weeks
#4 4 2 4.428571 weeks
#5 5 0 0.000000 weeks
#6 6 0 0.000000 weeks
#7 7 0 0.000000 weeks
#8 8 0 0.000000 weeks
#9 9 0 0.000000 weeks
#10 10 2 8.714286 weeks
答案 2 :(得分:0)
这是使用tidyverse的另一种尝试。有关“咒语”的数据是面板数据的常见转换;在tidyverse方法中,我认为技巧是生成一个拼写变量,例如OP原始代码中的“ run”变量。
# libraries
library(tidyverse)
library(zoo)
library(lubridate)
# example dataset
ID <- c(1:10, 1:10, 1:10)
date <- c("2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01", "2006-09-01",
"2006-09-01", "2006-09-01", "2006-09-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01",
"2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-10-01", "2006-11-01",
"2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01", "2006-11-01",
"2006-11-01", "2006-11-01")
act <- c("Unemployed", "Employment", "Education", "Education", "Education", "Education", "Education",
"Education", "Education", "Unemployed", "Education", "Unemployed", "Unemployed", "Unemployed",
"Education", "Education", "Employment", "Education", "Education", "NA", "Unemployed",
"Unemployed", "NA", "Unemployed", "Education", "Employment", "Employment", "NA", "Education",
"Unemployed")
df <- data.frame(ID, date, act)
df[order(ID),]
# convert types of some variables (in particular use zoo::yearmon instead of date, since these are actually yearmonth combos)
df$act <- as.character(df$act)
df$date <- lubridate::ymd(df$date)
df$yearmon <- zoo::as.yearmon(df$date)
df$act <- ifelse(df$act=='NA',NA,df$act)
# construct "act2", which is act, except when an NA is surrounded by the SAME act before and after, it is replaced with that same act
# e.g. Unemployed NA Unemployed -> Unemployed Unemployed Unemployed
# e.g. Education NA Unemployed -> stays the same
# (see note at the end of this discussion for more details on this)
df <- df %>% arrange(ID,date)
df <- df %>% group_by(ID) %>% mutate(
act2 = ifelse(is.na(act) & (lag(act)==lead(act)), lead(act), act)
)
# create "spell" variable, which is like the "run" variable in the example code
# within ID this identifies the spell that is currently taken place
# --- this is the most important part of the code ---
df <- df %>% group_by(ID) %>% mutate(
spell = cumsum(coalesce(is.na(act2) | act2!=lag(act2),FALSE)) + 1
)
# add yearmonth + 1 month, in order to do duration calculations
# (I'm again exploiting the fact that your data is monthly. if this were not true, this variable could be lead(date), within ID. but then we'd have to figure out how to deal with ends of the panel, where lead(date) is NA)
df$yearmonplusmonth <- df$yearmon + (1/12)
# construct a dataset of ID-spell combinations
spells <- df %>% group_by(ID,spell) %>% summarize(
spelltype = first(act2),
duration = (max(yearmonplusmonth) - min(yearmon))*12
)
# construct a dataset at the ID level, with desired summaries of spells
spellsummary <- spells %>% group_by(ID,spelltype) %>% summarize(
spell_count = n(),
duration = sum(duration)
)
# if there are no spells of a given spelltype, it doesn't appear in spellsummary
# we need to fill out spellsummary with zeroes in ID-spelltype cases where there are no spells:
temp <- expand.grid(ID = unique(spellsummary$ID), spelltype = unique(spellsummary$spelltype))
spellsummary <- full_join(spellsummary,temp,by=c('ID','spelltype'))
spellsummary <- spellsummary %>% mutate_at(vars(spell_count,duration),funs(coalesce(as.numeric(.),0)))
spellsummary <- spellsummary %>% mutate_at(vars(spell_count,duration),funs(round(.,0)))
spellsummary <- spellsummary %>% arrange(ID,spelltype)
# finally, we just want Unemployed spelltype summaries by ID:
spellsummary %>% filter(spelltype=='Unemployed')
# A tibble: 10 x 4
# Groups: ID [10]
# ID spelltype spell_count duration
# <int> <chr> <dbl> <dbl>
# 1 1 Unemployed 2 2
# 2 2 Unemployed 1 2
# 3 3 Unemployed 1 1
# 4 4 Unemployed 1 2
# 5 5 Unemployed 0 0
# 6 6 Unemployed 0 0
# 7 7 Unemployed 0 0
# 8 8 Unemployed 0 0
# 9 9 Unemployed 0 0
# 10 10 Unemployed 1 3
注意:我在最后一行的持续时间中得到3,而不是在OP的期望输出中得到2。原因是我认为Unemp NA Unemp实际上是Unemp Unemp Unemp,无论是spell_count还是持续时间都是如此。 OP希望对于spell_count是这种情况,但在持续时间内不是如此。为此,一种方法可能是使用“ act”变量进行持续时间计算,使用“ act2”变量进行spell_count计算-我将其留给读者。