我想计算多个日期变量两两之间相差天数的平均值和中位数。我的原始数据 `df` 可能如下所示:
id invitation account_date first_order second_order third_order
1 1/1/2016 1/7/2016 1/20/2016 1/22/2016 NA
2 1/1/2016 1/8/2016 1/22/2016 1/23/2016 1/25/2016
3 1/1/2016 1/5/2016 1/20/2016 2/1/2016 NA
4 1/1/2016 1/2/2016 1/18/2016 2/4/2016 2/6/2016
由于我的数据已经正确地存储为日期类型,先手动算出各对日期之间的差值,再求这些差值的平均值和中位数并不难,例如:
id inv_to_act act_to_first first_to_sec sec_to_third
1 6 13 2 NA
2 7 14 1 2
3 4 15 12 NA
4 1 16 17 2
然后使用 base R:`mean(df$act_to_first, na.rm = TRUE)`。
但是,我想对多个数据集或同一数据集的多个子集进行这些计算,因此一遍又一遍地手动执行每个步骤是不可扩展的。另外,我非常确定一定存在基于 `melt` 或 `plyr` 的解决方案,只是我还没有想到。
答案 0 :(得分:1)
您可以遍历每一对日期列,并使用 `difftime` 计算每对日期之间相差的天数:
# Every unordered pair of date columns; the leading id column is dropped.
# Each column of `combos` holds one (from, to) pair of column names.
combos <- combn(names(df)[-1], 2)

# One column of day differences per pair: later column minus earlier column.
diffs <- apply(combos, 2, function(pair) {
  start <- df[, pair[1]]
  end <- df[, pair[2]]
  difftime(end, start, units = "days")
})

# Label each column as <from>_TO_<to>, then print the matrix.
colnames(diffs) <- paste(combos[1, ], combos[2, ], sep = "_TO_")
diffs
# invitation_TO_account_date invitation_TO_first_order invitation_TO_second_order invitation_TO_third_order account_date_TO_first_order
# [1,] 6 19 21 NA 13
# [2,] 7 21 22 24 14
# [3,] 4 19 31 NA 15
# [4,] 1 17 34 36 16
# account_date_TO_second_order account_date_TO_third_order first_order_TO_second_order first_order_TO_third_order second_order_TO_third_order
# [1,] 15 NA 2 NA NA
# [2,] 15 17 1 3 2
# [3,] 27 NA 12 NA NA
# [4,] 33 35 17 19 2
执行该步骤后,您应该能够轻松计算每列的平均值:
# Per-pair mean of the day differences; rows with a missing date are skipped.
colMeans(x = diffs, na.rm = TRUE)
# invitation_TO_account_date invitation_TO_first_order invitation_TO_second_order invitation_TO_third_order account_date_TO_first_order
# 4.5 19.0 27.0 30.0 14.5
# account_date_TO_second_order account_date_TO_third_order first_order_TO_second_order first_order_TO_third_order second_order_TO_third_order
# 22.5 26.0 8.0 11.0 2.0
有了这些步骤之后,您可以把它们封装进一个函数,并将该函数应用于任意输入数据框 `df`:
#' Mean number of days between every pair of date columns
#'
#' The first column of `df` is treated as an identifier and skipped. Every
#' remaining column must hold dates or date-times that `difftime()` accepts.
#'
#' @param df A data frame whose first column is an id and whose remaining
#'   columns (at least two) are dates.
#' @return A named numeric vector with one element per pair of date columns,
#'   named `<from>_TO_<to>` in column order. Each element is the mean
#'   difference in days (`to - from`) with missing values ignored; it is
#'   `NaN` when a pair has no row with both dates present.
meanDateRanges <- function(df) {
  combos <- combn(names(df)[-1], 2)
  n_rows <- nrow(df)

  # vapply() pins the shape of each result. apply() collapses to a plain
  # vector for a one-row data frame, which then breaks `colnames<-`.
  diffs <- vapply(
    seq_len(ncol(combos)),
    function(j) {
      # `[[` extracts the column itself for data frames and tibbles alike.
      from <- df[[combos[1, j]]]
      to <- df[[combos[2, j]]]
      as.numeric(difftime(to, from, units = "days"))
    },
    numeric(n_rows)
  )

  # vapply() returns a bare vector when n_rows is 1, so rebuild the matrix
  # explicitly to guarantee one column per pair.
  diffs <- matrix(diffs, nrow = n_rows, ncol = ncol(combos))
  colnames(diffs) <- paste0(combos[1, ], "_TO_", combos[2, ])
  colMeans(diffs, na.rm = TRUE)
}
您可以用 `meanDateRanges(df)` 对单个输入数据框运行此函数,也可以用 `lapply(df.list, meanDateRanges)` 对一个数据框列表中的每个数据框运行它。
数据:
# Example data used by the snippets above, written as a literal structure()
# call (this looks like dput() output).
# Columns: `id` (1:4) followed by five date columns -- invitation,
# account_date, first_order, second_order, third_order -- each stored as a
# POSIXlt object, i.e. a list of broken-down time fields (sec, min, hour,
# mday, mon, year, wday, yday, isdst, zone, gmtoff).
# `mon` is zero-based and `year` counts from 1900, so mon = 0L, year = 116L
# is January 2016.
# third_order has NA fields for rows 1 and 3 (no third order for those ids).
df <- structure(list(id = 1:4, invitation = structure(list(sec = c(0,
0, 0, 0), min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L),
mday = c(1L, 1L, 1L, 1L), mon = c(0L, 0L, 0L, 0L), year = c(116L,
116L, 116L, 116L), wday = c(5L, 5L, 5L, 5L), yday = c(0L,
0L, 0L, 0L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST",
"EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), account_date = structure(list(sec = c(0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(7L,
8L, 5L, 2L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 116L,
116L, 116L), wday = c(4L, 5L, 2L, 6L), yday = c(6L, 7L, 4L,
1L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST", "EST",
"EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), first_order = structure(list(sec = c(0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(20L,
22L, 20L, 18L), mon = c(0L, 0L, 0L, 0L), year = c(116L, 116L,
116L, 116L), wday = c(3L, 5L, 3L, 1L), yday = c(19L, 21L,
19L, 17L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST",
"EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), second_order = structure(list(sec = c(0, 0, 0, 0),
min = c(0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 0L), mday = c(22L,
23L, 1L, 4L), mon = c(0L, 0L, 1L, 1L), year = c(116L, 116L,
116L, 116L), wday = c(5L, 6L, 1L, 4L), yday = c(21L, 22L,
31L, 34L), isdst = c(0L, 0L, 0L, 0L), zone = c("EST", "EST",
"EST", "EST"), gmtoff = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon",
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt")), third_order = structure(list(sec = c(NA, 0, NA, 0
), min = c(NA, 0L, NA, 0L), hour = c(NA, 0L, NA, 0L), mday = c(NA,
25L, NA, 6L), mon = c(NA, 0L, NA, 1L), year = c(NA, 116L, NA,
116L), wday = c(NA, 1L, NA, 6L), yday = c(NA, 24L, NA, 36L),
isdst = c(-1L, 0L, -1L, 0L), zone = c("", "EST", "", "EST"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_
)), .Names = c("sec", "min", "hour", "mday", "mon", "year",
"wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt",
"POSIXt"))), .Names = c("id", "invitation", "account_date", "first_order",
"second_order", "third_order"), row.names = c(NA, -4L), class = "data.frame")