我想计算两个日期之间的变量的平均值。
这里是数据帧的例子
library(lubridate) # provides ymd() and mdy()

# Daily occupancy records: the same 46 consecutive days repeated for each of
# the four centers, with a random occupancy value between 20 and 40.
day <- rep(
  seq(ymd("2020-03-01"), ymd("2020-04-15"), by = "day"),
  times = 4
)
center <- rep(c("A", "B", "C", "D"), each = 46)
ocupation <- round(runif(184, min = 20, max = 40), digits = 1)
df <- data.frame(day, center, ocupation)

# One row per patient: identifier, center, and the start/end dates.
start <- mdy("03/15/2020", "04/12/2020", "05/01/2020", "02/13/2020")
end <- mdy("03/20/2020", "04/28/2020", "05/14/2020", "03/01/2020")
# Note: this reassigns the global `center`; from here on it holds the four
# patient centers, not the 184 values stored in `df$center`.
center <- c("A", "A", "B", "C")
id <- c(1, 2, 3, 4)
patients <- data.frame(id, center, start, end)
上面显示的患者数据框只是一个样本,原始数据包含超过 12,000 个 ID。
我想为每个 ID 计算其所在中心在开始日期和结束日期之间的平均占用率(ocupation)。
答案 0 :(得分:0)
我将创建一个返回一个ID的平均占用率的函数:
# Mean occupancy of one patient's center during that patient's stay.
#
# Args:
#   id:        a single patient identifier; it is looked up by value in
#              `patients$id` (not used as a row position).
#   patients:  data frame with columns `id`, `center`, `start` and `end`.
#   occupency: numeric vector of occupancy values.
#   day:       vector of dates, parallel to `occupency`.
#   center:    vector of center labels, parallel to `occupency`.
#   inclusive: if TRUE the start and end days themselves are counted; the
#              default FALSE keeps the strict comparison (boundary days
#              excluded).
#
# Returns:
#   The mean of the selected occupancy values; NA if `id` is not present in
#   `patients`; NaN if no observation falls inside the window.
mean.occ <- function(id, patients, occupency, day, center, inclusive = FALSE) {
  # Locate the patient's row by id value so ids need not be 1..nrow(patients).
  row <- match(id, patients[["id"]])
  if (is.na(row)) {
    return(NA_real_)
  }

  first_day <- patients[["start"]][row]
  last_day <- patients[["end"]][row]
  in_window <- if (inclusive) {
    day >= first_day & day <= last_day
  } else {
    day > first_day & day < last_day
  }

  to_select <- in_window & center == patients[["center"]][row]
  mean(occupency[to_select])
}
在这里,day > patients[id, "start"] & day < patients[id, "end"] & center == patients[id, "center"]
选择特定ID的开始日期和结束日期之间的占用率值,并与给定的中心相对应。
然后使用 sapply 将其应用于每个 ID:
# Apply mean.occ to every patient id. The occupancy, day and center vectors
# are taken from `df`: the global `center` now holds the four patient centers
# and would be silently recycled against the 184 daily records.
mean.occupancies <- sapply(
  patients$id,
  FUN = mean.occ,
  patients, df$ocupation, df$day, df$center
)
最后可以将结果添加到 patients 数据框中:
# Append the per-patient means as a new column named `mean.occupancies`.
patients <- data.frame(patients, mean.occupancies, check.names = FALSE)
答案 1 :(得分:0)
您可以使用 tidyverse 中的 dplyr 软件包来完成此操作。
library(dplyr) # 1.0.0; load before calling as_tibble(), which dplyr re-exports
df <- as_tibble(df)

df %>%
  # pair every daily record with each patient of the same center; base merge()
  # is used so the many-to-many match behaves the same in any dplyr version
  merge(patients, by = "center") %>%
  # keep only the days inside that patient's stay (start and end included)
  filter(day >= start, day <= end) %>%
  # group by id
  group_by(id) %>%
  # find mean occupation for each id; ids without any matching day drop out
  summarise(mean_occupation = mean(ocupation))
# A tibble: 3 x 2
id mean_occupation
<dbl> <dbl>
1 1 29.7
2 2 31.7
3 4 32.2
编辑
使用 for 循环的版本,可以处理任意多个 id:
library(dplyr) # load first: as_tibble(), tibble() and %>% come from here
df <- as_tibble(df)

# For every patient, average the occupancy recorded at that patient's own
# center over the days of the stay (start and end included).
n_patients <- nrow(patients)
mean_occupation <- rep(NA_real_, n_patients)
has_data <- logical(n_patients)
for (i in seq_len(n_patients)) {
  # which() drops NA comparisons, so a missing date simply selects nothing
  rows <- which(
    df$center == patients$center[i] &
      df$day >= patients$start[i] &
      df$day <= patients$end[i]
  )
  has_data[i] <- length(rows) > 0
  if (has_data[i]) {
    mean_occupation[i] <- mean(df$ocupation[rows])
  }
}

# Keep only the ids with at least one matching record; id is stored as a
# character column and the rows are ordered by it.
keep <- which(has_data)
final <- tibble(
  id = as.character(patients$id[keep]),
  mean_occupation = mean_occupation[keep]
) %>%
  arrange(id)
> final
# A tibble: 3 x 2
id mean_occupation
<chr> <dbl>
1 1 29.7
2 2 31.7
3 4 32.2