在R中的函数中使用dplyr,然后在for循环中执行该函数

时间:2018-06-21 15:10:22

标签: r function for-loop dplyr

我有一个数据框,我们称其为df1,看起来像这样:

product_key              month    price     productage

00020e32-8ecd53a64715   201508  65.00000    1
00020e32-8ecd53a64715   201509  65.00000    2
00020e32-8ecd53a64715   201510  65.00000    3
000340b8-60fb50bacac8   201504  55.00000    1
000340b8-60fb50bacac8   201505  55.00000    2
000340b8-60fb50bacac8   201506  53.16667    3
000340b8-60fb50bacac8   201507  27.50000    4
000340b8-60fb50bacac8   201508  27.50000    5
000340b8-60fb50bacac8   201509  27.50000    6
000340b8-60fb50bacac8   201510  27.50000    7
000458f1-9304a2fdb6ae   201506  49.00000    1
000458f1-9304a2fdb6ae   201507  49.00000    2
000458f1-9304a2fdb6ae   201508  49.00000    3
000458f1-9304a2fdb6ae   201509  49.00000    4
000458f1-9304a2fdb6ae   201510  49.00000    5

我想做的是筛选出已在数据集中存在1个月的所有产品(例如filter(productage == 1)),然后根据这些产品及其价格创建一个单位价值指数(UVI)。接着,我想对已在数据集中存在2个月、3个月等的产品执行相同的操作……

到目前为止,我一直采用下面这种很费劲的做法:

MONTH 1

# Keep only the rows where the product is in its first month in the data set.
df1month1 <- filter(df1, productage == 1)

每个月的产品平均价格

# Mean price over the products in df1month1, one row per month.
df1_UVIMONTH1 <- summarise(group_by(df1month1, month), aveprice = mean(price))

产品年龄为1个月的UVI:计算UVI价格指数

  # Month-on-month unit value index (UVI) of the monthly mean prices in
  # df1_UVIMONTH1, plus its chained (cumulative product) form.
  df1UVIMONTH1 <- df1_UVIMONTH1 %>%
    # Go through as.character() first: as.numeric() on a factor returns the
    # internal level codes (1, 2, ...) instead of the month values themselves.
    mutate(month = as.numeric(as.character(month))) %>%
    arrange(month) %>%
    mutate(
      # The earliest month is the base period (index 1); every later month is
      # the ratio of its mean price to the previous row's mean price.
      UVI = case_when(
        month == min(month) ~ 1,
        month != min(month) ~ aveprice / lag(aveprice)
      ),
      chained = cumprod(UVI)
    )

但是,对于数据集中的每个产品年龄(最多26个)以及10个不同的数据集都这样做,既冗长又乏味。我正在尝试让这个过程更高效,但遇到了困难。

我试图创建一个函数:

#' Unit value index (UVI) for products of a given age
#'
#' Keeps the rows of `df1` whose `productage` is in `age`, averages `price`
#' per `month`, and derives a month-on-month index and its chained form.
#'
#' @param df1 A data frame with columns `productage`, `month` and `price`.
#' @param age One or more product ages to keep (matched with `%in%`).
#' @return A data frame with one row per month, sorted by month, with columns
#'   `month` (numeric), `aveprice` (mean price), `UVI` (1 for the earliest
#'   month, otherwise mean price divided by the previous row's mean price) and
#'   `chained` (cumulative product of `UVI`).
product_by_age <- function(df1, age) {
  monthly_mean <- df1 %>%
    filter(productage %in% age) %>%
    group_by(month) %>%
    summarise(aveprice = mean(price))

  # Ending on the pipeline itself (not on an assignment) makes the function
  # return its value visibly, so a bare call prints the result.
  monthly_mean %>%
    # Go through as.character() first: as.numeric() on a factor returns the
    # internal level codes (1, 2, ...) instead of the month values themselves.
    mutate(month = as.numeric(as.character(month))) %>%
    arrange(month) %>%
    mutate(
      UVI = case_when(
        month == min(month) ~ 1,
        month != min(month) ~ aveprice / lag(aveprice)
      ),
      chained = cumprod(UVI)
    )
}



# One row per distinct product age present in the data.
df1productage <- data.frame(
  age = unique(df1$productage),
  stringsAsFactors = FALSE
)

# Build one index table per age, then bind them all at once. Iterating over
# the `age` column (not over the data frame itself) passes each age value to
# product_by_age(), and collecting into a list avoids re-copying `result` on
# every iteration.
age_tables <- lapply(
  seq_len(nrow(df1productage)),
  function(i) product_by_age(df1, df1productage$age[i])
)
result <- dplyr::bind_rows(age_tables)

但这对我不起作用。请帮忙!如果有人能想出更好的方法来解决此问题,请告诉我。我也不介意您完全重写该函数。

要重新创建示例数据集,您可以使用:

# Sample data: three products observed for 3, 7 and 5 consecutive months.
product_key <- rep(
  c("00020e32-8ecd53a64715", "000340b8-60fb50bacac8", "000458f1-9304a2fdb6ae"),
  times = c(3L, 7L, 5L)
)
month <- c(
  "201508", "201509", "201510",
  "201504", "201505", "201506", "201507", "201508", "201509", "201510",
  "201506", "201507", "201508", "201509", "201510"
)
# price must be numeric, otherwise mean(price) returns NA with a warning.
price <- c(
  65, 65, 65,
  55, 55, 53.16667, 27.5, 27.5, 27.5, 27.5,
  49, 49, 49, 49, 49
)
# Integer ages sort numerically (2 before 10), unlike character ages.
productage <- c(1:3, 1:7, 1:5)
df1 <- data.frame(
  product_key, month, price, productage,
  stringsAsFactors = FALSE
)

2 个答案:

答案 0 :(得分:1)

我们需要稍微修改一下循环。假设我们要遍历“df1productage”的行序号,并且已将“result”初始化为空的data.frame:

# Compute the index table for each product age, then append all of them to the
# pre-initialised `result` in a single rbind() call rather than re-copying
# `result` on every iteration.
age_tables <- lapply(
  seq_len(nrow(df1productage)),
  function(i) product_by_age(df1, df1productage$age[i])
)
result <- do.call(rbind, c(list(result), age_tables))

dim(result)
#[1] 15  4

或使用tidyverse方式

library(tidyverse)

# Name each age by its own value so that `.id` stores the actual product age
# in `grp`; for an unnamed vector `.id` would store the element's position.
ages <- pull(df1productage, age)
names(ages) <- as.character(ages)

map_df(ages, ~ product_by_age(df1, .x), .id = "grp")
# A tibble: 15 x 5
#   grp   month aveprice   UVI chained
#   <chr> <dbl>    <dbl> <dbl>   <dbl>
# 1 1         1     55   1       1    
# 2 1         3     49   0.891   0.891
# 3 1         5     65   1.33    1.18 
# 4 2         2     55   1       1    
# 5 2         4     49   0.891   0.891
# 6 2         6     65   1.33    1.18 
# 7 3         3     53.2 1       1    
# 8 3         5     49   0.922   0.922
# 9 3         7     65   1.33    1.22 
#10 4         4     27.5 1       1    
#11 4         6     49   1.78    1.78 
#12 5         5     27.5 1       1    
#13 5         7     49   1.78    1.78 
#14 6         6     27.5 1       1    
#15 7         7     27.5 1       1    

编辑:在map_df中添加了一个标识符列

答案 1 :(得分:1)

无需编写新函数,使用分组即可实现!

# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the pipeline fail later with a less helpful message.
library(dplyr)

# Mean price per (month, productage), then a month-on-month index within each
# product age. The result stays grouped by productage.
df1 %>%
  group_by(month, productage) %>%
  summarise(aveprice = mean(price)) %>%
  arrange(productage, month) %>%
  group_by(productage) %>%
  mutate(
    # First month of each age is the base (1). Later months divide each mean
    # price by the one before it: aveprice[-1] drops the first element and
    # aveprice[-length(aveprice)] drops the last, so both are empty for a
    # single-row group and UVI is just 1.
    UVI = c(1, aveprice[-1] / aveprice[-length(aveprice)]),
    chained = cumprod(UVI)
  )

 ### Group and then regroup. and I have modified your mutate code which was using 'lag' 

# A tibble: 15 x 5
# Groups:   productage [7]
    month productage aveprice   UVI chained
    <dbl> <chr>         <dbl> <dbl>   <dbl>
 1 201504 1              55.0 1.00    1.00 
 2 201506 1              49.0 0.891   0.891
 3 201508 1              65.0 1.33    1.18 
 4 201505 2              55.0 1.00    1.00 
 5 201507 2              49.0 0.891   0.891
 6 201509 2              65.0 1.33    1.18 
 7 201506 3              53.2 1.00    1.00 
 8 201508 3              49.0 0.922   0.922
 9 201510 3              65.0 1.33    1.22 
10 201507 4              27.5 1.00    1.00 
11 201509 4              49.0 1.78    1.78 
12 201508 5              27.5 1.00    1.00 
13 201510 5              49.0 1.78    1.78 
14 201509 6              27.5 1.00    1.00 
15 201510 7              27.5 1.00    1.00 

现在,您只需使用split,即可按所需的列(例如productage)对结果进行拆分。