根据另一列的值创建滞后

时间:2019-05-20 19:07:46

标签: r

我有如下数据:

   month shop product sales sales_per_shop
1      1    1       1    10             90
2      1    1       2    20             90
3      1    2       1    40            120
4      1    3       2    50            150
5      2    1       1    10             90
6      2    1       2    20             90
7      2    2       1    40            120
8      2    3       2    50            150
9      3    1       1    10             90
10     3    1       2    20             90
11     3    2       1    40            120
12     3    3       2    50            150

我的目标是为 sales 和 sales_per_shop 两列分别创建滞后一个月的列(即取上一个月的值)。

对于 sales 列,这没有问题,因为每个 (shop, product) 组合在每个月只有一行。

# One-month lag of `sales` within each (shop, product) pair.
z %>%
  group_by(shop, product) %>%
  # The new column holds lagged `sales`, so it is named lag_sales (matching
  # the printed output); order_by = month makes the lag independent of the
  # row order of `z`.
  mutate(lag_sales = lag(sales, 1, order_by = month)) %>%
  head(5)

# A tibble: 5 x 6
# Groups:   shop, product [4]
  month  shop product sales sales_per_shop lag_sales
  <int> <dbl>   <dbl> <dbl>          <dbl>     <dbl>
1     1     1       1    10             90        NA
2     1     1       2    20             90        NA
3     1     2       1    40            120        NA
4     1     3       2    50            150        NA
5     2     1       1    10             90        10

但是,对于sales_per_shop,我无法这样做:

# Attempt that does NOT give the desired result: lag() shifts by one row
# inside each shop group, not by one month. When a shop has several rows in
# the same month, the later rows pick up a value from that same month.
z %>%
  group_by(shop) %>%
  mutate(lag_sales_per_shop = lag(sales_per_shop, 1)) 

# A tibble: 5 x 6
# Groups:   shop [3]
  month  shop product sales sales_per_shop lag_sales_per_shop
  <int> <dbl>   <dbl> <dbl>          <dbl>              <dbl>
1     1     1       1    10             90                 NA
2     1     1       2    20             90                 90
3     1     2       1    40            120                 NA
4     1     3       2    50            150                 NA
5     2     1       1    10             90                 90

如您所见,第一个月仍然出现了一个值。由于我要滞后一个月,第一个月不应该有任何值。是否可以根据另一列(例如 month)的值来进行滞后?

结果应如下所示:

# A tibble: 12 x 7
# Groups:   shop, product [4]
   month  shop product sales sales_per_shop lag_sales lag_sales_per_shop
   <int> <dbl>   <dbl> <dbl>          <dbl>     <dbl>              <dbl>
 1     1     1       1    10             90        NA                 NA
 2     1     1       2    20             90        NA                 NA
 3     1     2       1    40            120        NA                 NA
 4     1     3       2    50            150        NA                 NA
 5     2     1       1    10             90        10                 90
 6     2     1       2    20             90        20                 90
 7     2     2       1    40            120        40                120
 8     2     3       2    50            150        50                150
 9     3     1       1    10             90        10                 90
10     3     1       2    20             90        20                 90
11     3     2       1    40            120        40                120
12     3     3       2    50            150        50                150

  # Example data (looks like dput() output). It is not assigned to a name
  # here; presumably this is the object the snippets call `z` (and answer 0
  # calls `df`) -- assign it before running them.
  structure(list(month = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 
3L, 3L, 3L), shop = c(1, 1, 2, 3, 1, 1, 2, 3, 1, 1, 2, 3), product = c(1, 
2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2), sales = c(10, 20, 40, 50, 10, 
20, 40, 50, 10, 20, 40, 50), sales_per_shop = c(90, 90, 120, 
150, 90, 90, 120, 150, 90, 90, 120, 150)), row.names = c(NA, 
-12L), class = "data.frame")

2 个答案:

答案 0 :(得分:1)

您可能需要使用 left_join:

# `df` is presumably the question's example data frame (called `z` there).
# Build a lookup table in which each month is shifted forward by one, so the
# row keyed by month m carries the value observed in month m - 1, then join
# it back onto the original rows by (shop, month). Months without a
# predecessor get NA.
df %>%
  left_join(
    df %>%
      # 1L keeps `month` an integer, so both join keys have the same type
      mutate(month = month + 1L) %>%
      # sales_per_shop repeats for every product of a shop; keep one row per
      # (shop, month) so the join does not duplicate rows
      distinct(shop, month, sales_per_shop) %>%
      rename(lag_sales_per_shop = sales_per_shop),
    by = c("shop", "month")
  )


   month shop product sales sales_per_shop lag_sales_per_shop
1      1    1       1    10             90                 NA
2      1    1       2    20             90                 NA
3      1    2       1    40            120                 NA
4      1    3       2    50            150                 NA
5      2    1       1    10             90                 90
6      2    1       2    20             90                 90
7      2    2       1    40            120                120
8      2    3       2    50            150                150
9      3    1       1    10             90                 90
10     3    1       2    20             90                 90
11     3    2       1    40            120                120
12     3    3       2    50            150                150

答案 1 :(得分:1)

这是使用 filter 和 bind_rows 的另一个版本

library(dplyr)
# Rows of the earliest month are kept without lag columns; bind_rows() then
# fills lag_sales / lag_sales_per_shop with NA for them.
z %>%
  filter(month == min(month)) %>%
  bind_rows(
    z %>%
      # Compute a real one-month lag per (shop, product), ordered by month.
      # Copying the current month's values would only look right when every
      # month has identical data, as in the sample.
      # NOTE(review): assumes each (shop, product) pair has a row in every
      # month; otherwise the lag reaches back further than one month.
      group_by(shop, product) %>%
      mutate(lag_sales = lag(sales, 1, order_by = month),
             lag_sales_per_shop = lag(sales_per_shop, 1, order_by = month)) %>%
      ungroup() %>%
      filter(month != min(month))
  )
#   month shop product sales sales_per_shop lag_sales lag_sales_per_shop
#1      1    1       1    10             90        NA                 NA
#2      1    1       2    20             90        NA                 NA
#3      1    2       1    40            120        NA                 NA
#4      1    3       2    50            150        NA                 NA
#5      2    1       1    10             90        10                 90
#6      2    1       2    20             90        20                 90
#7      2    2       1    40            120        40                120
#8      2    3       2    50            150        50                150
#9      3    1       1    10             90        10                 90
#10     3    1       2    20             90        20                 90
#11     3    2       1    40            120        40                120
#12     3    3       2    50            150        50                150