Question

我有此数据：

library(dplyr)
glimpse(samp)
Observations: 15
Variables: 6
$ date         <date> 2013-01-04, 2013-01-31, 2013-01-09, 2013-01-20, 2013-01-29, 2013...
$ shop_id      <int> 4, 1, 30, 41, 26, 16, 25, 10, 29, 52, 54, 42, 8, 59, 31
$ item_id      <int> 1904, 17880, 14439, 15010, 10917, 10331, 2751, 1475, 16071, 13901...
$ item_cnt_day <dbl> 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1
$ month        <int> 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3
$ year         <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...

这只是大型数据集的一个样本，因此日期之间存在间断。在原始数据中，时间序列从2013-01-01开始，到2015-11-30结束。数据是时间序列。我的目标是计算滞后一个月的值。问题在于各个月的长度不一致（有的月份有30天，有的有31天）。要计算滞后，我必须指定一个固定的期数；但如上所述，由于各月天数不同，无法为“一个月”设置一个固定的数字。有没有办法按月计算滞后？

目标变量为item_cnt_day。需要计算其滚动平均值的滞后值。在此示例中，每个月只有5天的数据，因此结果应为：

library(RcppRoll)
library(dplyr)

# Right-aligned rolling mean of item_cnt_day over 5 rows, shifted down by one
# row so that each row sees the mean of the 5 rows before it.
samp %>%
  mutate(
    r_mean_5 = dplyr::lag(
      RcppRoll::roll_meanr(item_cnt_day, n = 5),
      n = 1
    )
  )
             date shop_id item_id item_cnt_day month year r_mean_5
30717  2013-01-04       4    1904            1     1 2013       NA
43051  2013-01-31       1   17880            1     1 2013       NA
66273  2013-01-09      30   14439            1     1 2013       NA
105068 2013-01-20      41   15010            1     1 2013       NA
23332  2013-01-29      26   10917            1     1 2013       NA
28838  2013-02-22      16   10331            1     2 2013      1.0
40418  2013-02-08      25    2751            2     2 2013      1.0
62219  2013-02-12      10    1475            1     2 2013      1.2
98641  2013-02-16      29   16071            1     2 2013      1.2
21905  2013-02-23      52   13901            2     2 2013      1.2
32219  2013-03-31      54    2972            1     3 2013      1.4
45156  2013-03-17      42   11184            1     3 2013      1.4
69513  2013-03-24       8   19405            1     3 2013      1.2
110206 2013-03-10      59    2255            1     3 2013      1.2
24473  2013-03-07      31   15119            1     3 2013      1.2

这里是dput()。

# dput() output of the 15-row sample shown above; assign the result to a
# variable (e.g. `samp`) to recreate the data frame.
structure(list(date = structure(c(15709, 15736, 15714, 15725, 
15734, 15758, 15744, 15748, 15752, 15759, 15795, 15781, 15788, 
15774, 15771), class = "Date"), shop_id = c(4L, 1L, 30L, 41L, 
26L, 16L, 25L, 10L, 29L, 52L, 54L, 42L, 8L, 59L, 31L), item_id = c(1904L, 
17880L, 14439L, 15010L, 10917L, 10331L, 2751L, 1475L, 16071L, 
13901L, 2972L, 11184L, 19405L, 2255L, 15119L), item_cnt_day = c(1, 
1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1), month = c(1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), year = c(2013L, 
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 
2013L, 2013L, 2013L, 2013L, 2013L)), row.names = c(30717L, 43051L, 
66273L, 105068L, 23332L, 28838L, 40418L, 62219L, 98641L, 21905L, 
32219L, 45156L, 69513L, 110206L, 24473L), class = "data.frame")

Answer 1

我对计算lag并不十分熟悉，但这也许就是您想要的吗？

数据：

# Sample data: 15 sales records, 5 per month for January-March 2013.
# Dates are stored as day counts since 1970-01-01 with class "Date".
df <- structure(
  list(
    date = structure(
      c(
        15709, 15736, 15714, 15725, 15734,
        15758, 15744, 15748, 15752, 15759,
        15795, 15781, 15788, 15774, 15771
      ),
      class = "Date"
    ),
    shop_id = c(
      4L, 1L, 30L, 41L, 26L,
      16L, 25L, 10L, 29L, 52L,
      54L, 42L, 8L, 59L, 31L
    ),
    item_id = c(
      1904L, 17880L, 14439L, 15010L, 10917L,
      10331L, 2751L, 1475L, 16071L, 13901L,
      2972L, 11184L, 19405L, 2255L, 15119L
    ),
    item_cnt_day = c(
      1, 1, 1, 1, 1,
      1, 2, 1, 1, 2,
      1, 1, 1, 1, 1
    ),
    month = rep(1:3, each = 5L),
    year = rep(2013L, times = 15L)
  ),
  row.names = c(
    30717L, 43051L, 66273L, 105068L, 23332L,
    28838L, 40418L, 62219L, 98641L, 21905L,
    32219L, 45156L, 69513L, 110206L, 24473L
  ),
  class = "data.frame"
)

计算：

# Nest the rows of each month (together with that month's calendar length)
# and call stats::lag() on item_cnt_day with k = days_in_month.
# NOTE(review): stats::lag() on a plain numeric vector appears to shift only
# the time base (the `tsp` attribute) and leave the values where they are --
# confirm this is the intended kind of "lag" before relying on the result.
df %>%
  dplyr::mutate(days_in_month = lubridate::days_in_month(date)) %>%
  # Name the nested column explicitly: the unnamed `nest(-c(...))` form is
  # deprecated in current tidyr and emits a warning.
  tidyr::nest(data = -c(month, days_in_month)) %>%
  dplyr::mutate(
    lag = purrr::map2(data, days_in_month, ~ stats::lag(.x$item_cnt_day, .y))
  )

根据评论进行编辑：

那也许是这样吗？

# Nest the rows of each month, count them, and average item_cnt_day over a
# window as wide as the month has rows, giving a single value per month
# (map2_dbl requires each result to have length one).
df %>%
  # Name the nested column explicitly: the unnamed `nest(-month)` form is
  # deprecated in current tidyr and emits a warning.
  tidyr::nest(data = -month) %>%
  dplyr::mutate(
    ndays = purrr::map_int(data, nrow),
    lag = purrr::map2_dbl(data, ndays, ~ zoo::rollmean(.x$item_cnt_day, .y))
  )

Answer 2

Date类的seq()方法支持按不同的时间间隔（例如按月）生成序列（参见documentation）。

因此您基本上可以做到：

# Return the date one calendar month after `date`.
#
# Builds a two-element monthly sequence starting at `date` and keeps the
# second element.
# NOTE(review): for end-of-month inputs the monthly step of seq() may spill
# into the following month (see ?seq.Date) -- confirm this is acceptable.
calculate_lag <- function(date) {
  month_steps <- seq(date, by = "1 month", length.out = 2)
  month_steps[2]
}


# Template line: substitute _YOUR_DATAFRAME_ before running. calculate_lag is
# applied to each element via sapply(), and the result is converted back to
# Date with as.Date(..., origin = "1970-01-01").
# NOTE(review): presumably the Date column (e.g. your_data$date) should be
# passed here rather than a whole data frame -- confirm.
date_column <- as.Date(sapply( _YOUR_DATAFRAME_ , calculate_lag), origin="1970-01-01")

Answer 3

也许是这样？

library(lubridate)

# Date one calendar month before each observation. %m-% rolls back to the
# last valid day of the month when needed (e.g. 2013-03-31 -> 2013-02-28).
df$lag <- df$date %m-% months(1)

# For every row, average item_cnt_day over all observations whose date lies
# in that row's closed window [lag, date].
# seq_len() keeps this correct for a zero-row data frame (1:nrow(df) would
# iterate over c(1, 0)), and vapply() guarantees a numeric result.
df$rollmean <- vapply(
  seq_len(nrow(df)),
  function(i) {
    in_window <- df$date >= df$lag[i] & df$date <= df$date[i]
    mean(df$item_cnt_day[in_window])
  },
  numeric(1)
)

             date shop_id item_id item_cnt_day month year        lag rollmean
30717  2013-01-04       4    1904            1     1 2013 2012-12-04 1.000000
43051  2013-01-31       1   17880            1     1 2013 2012-12-31 1.000000
66273  2013-01-09      30   14439            1     1 2013 2012-12-09 1.000000
105068 2013-01-20      41   15010            1     1 2013 2012-12-20 1.000000
23332  2013-01-29      26   10917            1     1 2013 2012-12-29 1.000000
28838  2013-02-22      16   10331            1     2 2013 2013-01-22 1.166667
40418  2013-02-08      25    2751            2     2 2013 2013-01-08 1.200000
62219  2013-02-12      10    1475            1     2 2013 2013-01-12 1.200000
98641  2013-02-16      29   16071            1     2 2013 2013-01-16 1.166667
21905  2013-02-23      52   13901            2     2 2013 2013-01-23 1.285714
32219  2013-03-31      54    2972            1     3 2013 2013-02-28 1.000000
45156  2013-03-17      42   11184            1     3 2013 2013-02-17 1.200000
69513  2013-03-24       8   19405            1     3 2013 2013-02-24 1.000000
110206 2013-03-10      59    2255            1     3 2013 2013-02-10 1.166667
24473  2013-03-07      31   15119            1     3 2013 2013-02-07 1.333333

%m-%为每个日期计算一个月前的日期，同时考虑月份的不同长度（31天，30天，28天），并将结果放入列lag中。然后在sapply()中，对日期落在当前迭代的lag与date之间的所有观察值，计算item_cnt_day的平均值。

因此，每个月有多少个元素或元素的顺序都没有关系。

计算一个月的滞后

3 个答案: