Question

我正在尝试找到一种方法，将新产品与我拥有历史数据的产品相匹配。然后，我将使用往年产品的历史数据对新产品进行一些预测。

请考虑以下数据子集：

# A tibble: 13 x 11
   prdct_id prdct_grp_1 prdct_grp_2 prdct_grp_3 prdct_grp_4 Start_season January February March April sales_total
      <dbl> <chr>       <chr>       <chr>       <chr>              <dbl>   <dbl>    <dbl> <dbl> <dbl>       <dbl>
 1     1.00 WUW         SW          BH          B21                 2017    2.00    10.0   5.00  4.00        21.0
 2     2.00 WUW         SW          BK          R21                 2017    7.00     9.00  4.00  5.00        25.0
 3     3.00 MUW         NW          UW          P1                  2018    6.00     8.00 10.0   6.00        32.0
 4     4.00 LNG         KW          LW          L1                  2016    8.00     9.00 12.0   7.00        36.0
 5     5.00 QKQ         MZ          KA          AQ                  2013   10.0      8.67 16.7   8.00        43.3
 6     6.00 MUW         NW          UW          P1                  2019    0        0     0     0            0  
 7     7.00 WUW         SW          BK          R21                 2019    0        0     0     0            0  
 8     8.00 LNG         NW          UW          P2                  2014   15.1      8.67 28.7  11.0         63.4
 9     9.00 QKQ         KW          LW          L2                  2016   16.8      8.67 32.7  12.0         70.1
10    10.0  WUW         MZ          KA          AQ                  2017   18.5      8.67 36.7  13.0         76.8
11    11.0  QKQ         MZ          KA          AQ                  2019    0        0     0     0            0  
12    12.0  WUW         MZ          KA          AQ                  2019    0        0     0     0            0  
13    13.0  MUW         NW          UW          P1                  2019    0        0     0     0            0

prdct_grp 代表产品组（例如 prdct_grp_1=WUW 表示该产品属于“女士内衣”，prdct_grp_2=SW 表示该产品属于“泳装”组，依此类推）。如果两个产品的 prdct_grp_1 至 prdct_grp_4 全部相同，那么我将假定它们的销售额非常相似。

我希望获得以下结果

# A tibble: 3 x 11
  new_prdct_id prdct_grp_1 prdct_grp_2 prdct_grp_3 prdct_grp_4 Start_s January February March April sales_total
  <chr>        <chr>       <chr>       <chr>       <chr>         <dbl>   <dbl>    <dbl> <dbl> <dbl>       <dbl>
1 6~3          MUW         NW          UW          P1             2019    6.00     8.00 10.0   6.00        32.0
2 7~2          WUW         SW          BK          R21            2019    7.00     9.00  4.00  5.00        25.0
3 11~5         QKQ         MZ          KA          AQ             2019   10.0      9.00 17.0   8.00        43.0

我尝试使用 tidyverse 来得到想要的结果，但效果不太理想。

另一个问题是：如果一个新产品与多个历史产品匹配，或者与另一个同样在 2019 年开始销售的产品匹配，我该如何处理？

谢谢您的帮助。

祝好，A

Answer 1

下面是可能的dplyr解决方案以及详细的注释。请始终通过提供dput()输出或至少一个用于创建数据集的代码段来确保您的问题是可重现的。

# import required package
library(dplyr)

# Reproduce the data frame from the question.
# `tibble()` is used instead of `data_frame()`, which is deprecated and
# emits a warning; dplyr re-exports `tibble()`, so no extra package is needed.
# The monthly values are copied from the table in the question (products
# whose Start_season is 2019 have no sales yet, hence the zeros).
prdct_df <- tibble(
  prdct_id = 1:13,
  prdct_grp_1 = c(
    "WUW", "WUW", "MUW", "LNG", "QKQ", "MUW", "WUW",
    "LNG", "QKQ", "WUW", "QKQ", "WUW", "MUW"
  ),
  prdct_grp_2 = c(
    "SW", "SW", "NW", "KW", "MZ", "NW", "SW",
    "NW", "KW", "MZ", "MZ", "MZ", "NW"
  ),
  prdct_grp_3 = c(
    "BH", "BK", "UW", "LW", "KA", "UW", "BK",
    "UW", "LW", "KA", "KA", "KA", "UW"
  ),
  prdct_grp_4 = c(
    "B21", "R21", "P1", "L1", "AQ", "P1", "R21",
    "P2", "L2", "AQ", "AQ", "AQ", "P1"
  ),
  Start_season = c(
    2017, 2017, 2018, 2016, 2013, 2019, 2019,
    2014, 2016, 2017, 2019, 2019, 2019
  ),
  January = c(2, 7, 6, 8, 10, 0, 0, 15.1, 16.8, 18.5, 0, 0, 0),
  # product 8 is 8.67 in the question's table (was mistyped as 8.86)
  February = c(10, 9, 8, 9, 8.67, 0, 0, 8.67, 8.67, 8.67, 0, 0, 0),
  # products 1 and 2 are 5 and 4 in the question's table (were swapped)
  March = c(5, 4, 10, 12, 16.7, 0, 0, 28.7, 32.7, 36.7, 0, 0, 0),
  April = c(4, 5, 6, 7, 8, 0, 0, 11, 12, 13, 0, 0, 0),
  sales_total = c(21, 25, 32, 36, 43.3, 0, 0, 63.4, 70.1, 76.8, 0, 0, 0)
)

# Seasons whose products count as "new". Extend this vector when further
# seasons are launched, e.g. c(2019, 2020, 2021).
new_prdct_seasons <- 2019

# Flag the new products once, then split the data on that flag so that
# new and historical products live in separate data frames (optional,
# but it keeps the matching step below clean).
is_new_prdct <- prdct_df$Start_season %in% new_prdct_seasons
new_prdct_df <- filter(prdct_df, is_new_prdct)
old_prdct_df <- filter(prdct_df, !is_new_prdct)

# Match every historical product with the new products that share all four
# product-group features, and attach the historical sales to the new product.
# Columns are referenced by name rather than by position, so the result stays
# correct if columns are added to or reordered in the input data.
prdct_grp_cols <- paste0("prdct_grp_", 1:4)

final_df <- old_prdct_df %>%
  inner_join(
    # from the new products only the id, the group features and the
    # start season are needed (their sales columns are all zero)
    select(new_prdct_df, prdct_id, prdct_grp_1:prdct_grp_4, Start_season),
    # inner join by product group features
    by = prdct_grp_cols,
    # explicit suffixes make it obvious which side a duplicated column is from
    suffix = c("_old", "_new")
  ) %>%
  # reorder the columns and rename them where necessary; the start season
  # reported is that of the new product, the sales are those of the old one
  select(
    new_prdct_id = prdct_id_new,
    old_prdct_id = prdct_id_old,
    prdct_grp_1:prdct_grp_4,
    Start_season = Start_season_new,
    January:sales_total
  )

# we obtained the data frame you asked for
# note that we avoided matches among new products by keeping new and old
# products in distinct data frames
final_df
# # A tibble: 5 x 12
#   new_prdct_id old_prdct_id prdct_grp_1 prdct_grp_2 prdct_grp_3 prdct_grp_4 Start_season January
#          <int>        <int> <chr>       <chr>       <chr>       <chr>              <dbl>   <dbl>
# 1            7            2 WUW         SW          BK          R21                 2019     7  
# 2            6            3 MUW         NW          UW          P1                  2019     6  
# 3           13            3 MUW         NW          UW          P1                  2019     6  
# 4           11            5 QKQ         MZ          KA          AQ                  2019    10  
# 5           12           10 WUW         MZ          KA          AQ                  2019    18.5
# # ... with 4 more variables: February <dbl>, March <dbl>, April <dbl>, sales_total <dbl>

# If each historical product should be used for at most one new product,
# keep only the first match per old_prdct_id. For the data above this drops
# the match 13-3 because 6-3 comes first. Unlike a hard-coded row index,
# this keeps working when the data changes.
distinct(final_df, old_prdct_id, .keep_all = TRUE)

如何使用R对销售数据进行分组？

1 个答案: