我正在使用tidyr :: nest从boot
包传递一个grouped_by表到函数boot.ci
和boot
,以计算非参数统计量的均值和置信区间。对于非重叠的组,如下所示,效果很好:
library(dplyr)
library(tidyr)
library(purrr)
library(lubridate)
library(broom)
library(boot)
#toy example
set.seed(1)
Sys.setenv(TZ="America/Chicago")
df <- data.frame(date = mdy("01-01-2018")+ddays(sample(0:364,100,replace = T)),
score = sample(0:10,100,replace = T,prob=c(0.15,0.15,rep(0.15/7,7),0.25,0.3)))
# the statistic of interest
net_promoter_score <- function(data,col_name='score') {
return(
(sum(data[[col_name]]>=9,na.rm=TRUE)-
sum(data[[col_name]]<=6,na.rm=TRUE))/sum(!is.na(data[[col_name]]))*100
)
}
# boot needs to resample the staistic by index
nps_boot <- function(d,i) net_promoter_score(d[i,])
#do NPS confidence intervals by month - this works fine!
by_month = df %>%
mutate(month = lubridate::month(date,label=T,abbr=T)) %>%
nest(-month) %>%
mutate(boots = map(data, ~boot::boot(.x,nps_boot,R=4999)),
CI = map(boots, ~boot::boot.ci(.x,conf=0.9)$bca),
tidied_NPS = map(boots,broom::tidy),
tidied_CI = map(CI,broom::tidy)
) %>%
unnest(tidied_NPS,tidied_CI,.drop=T) %>%
select(month,mean=statistic,CI10=V4,CI90=V5)
by_month %>% head
A tibble: 6 x 4
month mean CI10 CI90
<ord> <dbl> <dbl> <dbl>
1 Apr 0 -100 33.3
2 May 6.67 -46.7 33.3
3 Jul 60 -100 60
4 Nov -20 -80 20
5 Mar -11.1 -66.7 33.3
6 Dec 0 -100 50
但是我想对滑动窗口执行此操作-有点像移动平均线,只不过我想使用其他统计数据进行滑动。我可以使用lapply
来做到这一点,但我想使用tidyverse
。
#do 50-sample sliding window. I would like to solve this with tidyverse
window_size = 50
results = lapply(1:(nrow(df)-window_size), function(x) {
boot_df = df %>% arrange(date) %>% slice(x:(x+window_size-1))
boot = boot::boot(boot_df,nps_boot,R=999)
CI = boot.ci(boot,conf=0.9)$bca[4:5]
return(c(x,mean(boot$t),CI))
})
by_slide = as.data.frame(do.call(rbind, results)) %>%
select(date=V1,mean=V2,CI10=V3,CI90=V4) %>%
mutate(date = mdy("01-01-2018")+ddays((window_size %/% 2)+date))
by_slide %>% head
date mean CI10 CI90
1 2018-01-27 15.40541 -8.00000 38
2 2018-01-28 15.94194 -8.00000 36
3 2018-01-29 15.83383 -8.00000 36
4 2018-01-30 15.24525 -8.00000 38
5 2018-01-31 15.79780 -10.00000 36
6 2018-02-01 15.82583 -10.92218 36
答案 0 :(得分:1)
您可以使用x1 + x2
:
x1 + x2
然后,您可以直接在计算x4
中使用purrr::map_dfr()
:
results <- purrr::map_dfr(1:(nrow(df)-window_size), function(x) {
boot_df = df %>% arrange(date) %>% slice(x:(x+window_size-1))
boot = boot::boot(boot_df,nps_boot,R=999)
CI = boot.ci(boot,conf=0.9)$bca[4:5]
list(date = boot_df$date[1],
mean = mean(boot$t),
ci_lo = CI[1],
ci_hi = CI[2])
})
results
# A tibble: 50 x 4
date mean ci_lo ci_hi
<date> <dbl> <dbl> <dbl>
1 2018-01-05 15.6 -8 38
2 2018-01-09 16.3 -8 36
3 2018-01-22 16.2 -10 36
4 2018-01-23 15.6 -10 36
5 2018-01-26 15.2 -10 36
6 2018-01-31 16.5 -10 36
7 2018-02-06 19.7 -4.75 40
8 2018-02-09 19.5 -8 40
9 2018-02-14 16.3 -10 36
10 2018-02-15 16.1 -10 36
# … with 40 more rows
尽管我承认我不理解在results
持续时间对象中添加by_slide
是如何工作的,但这似乎与您提供的输出无关。但是我假设这是一个语法问题-与您有关如何替换by_slide = results %>%
mutate(date = mdy("01-01-2018") + ddays(window_size %/% 2))
的问题分开。