使用R进行组乘法

时间:2017-06-20 17:52:14

标签: r dataframe data.table dplyr mutate

我想要每组日期的组倍增。 格式1 - 预期输出应如下:

 date      Bucket             D        DP    
1/31/2013   bkt 0             NA
1/31/2013   bkt 1(10-20)      NA
1/31/2013   bkt 2(20-30)      NA
1/31/2013   bkt 3(30-40)      NA
1/31/2013   bkt 4(40+)        NA
2/28/2013   bkt 0             NA
2/28/2013   bkt 1(10-20)      3.00
2/28/2013   bkt 2(20-30)      3.63
2/28/2013   bkt 3(30-40)      101
2/28/2013   bkt 4(40+)        102
3/30/2013   bkt 0             NA
3/30/2013   bkt 1(10-20)      0.55
3/30/2013   bkt 2(20-30)      0.40
3/30/2013   bkt 3(30-40)      103
3/30/2013   bkt 4(40+)        104
4/31/2013   bkt 0             NA
4/31/2013   bkt 1(10-20)      4.25              
4/31/2013   bkt 2(20-30)      3.65              
4/31/2013   bkt 3(30-40)      105        
4/31/2013   bkt 4(40+)        106        
5/30/2013   bkt 0             NA
5/30/2013   bkt 1(10-20)      2.34  13608   (108 * 105 *  0.40 *  3.00)         
5/30/2013   bkt 2(20-30)      4.10  4536    (108 * 105 * 0.40)                  
5/30/2013   bkt 3(30-40)      107   11340   (108 * 105)   
5/30/2013   bkt 4(40+)        108   108     (108)

格式2 为了更多的理解 - 我已按以下方式安排数据:

1/31/2013  2/28/2013  3/30/2013 4/31/2013  5/30/2013  DP   
NA         NA         NA        NA         NA         NA
NA         3.00       0.55      4.25       2.34       13608   (108 * 105 *  0.40 *  3.00)
NA         3.63       0.40      3.65       4.10       4536    (108 * 105 * 0.40)
NA         101        103       105        107        11340   (108 * 105)
NA         102        104       106        108        108     (108)

但我的CSV文件格式为1: 请帮助,在此先感谢

3 个答案:

答案 0 :(得分:0)

我制作了一些与你的格式相同的数据:

set.seed(1)
DF <- data.frame( date = rep(c(ymd("2017-01-01", "2016-01-01", "2015-01-01", "2014-01-01", "2013-01-01")), each=5),
                  group = rep(1:5, each=5),
                  value = rnorm(25))
head(DF)

         date group       value
1  2017-01-01     1 -0.05612874
2  2017-01-01     1 -0.15579551
3  2017-01-01     1 -1.47075238
4  2017-01-01     1 -0.47815006

我是这样做的:

M <- matrix(DF$value, ncol=5)
diag.M <- diag(M)
diag.product <- sapply(5:1, function(x) prod(diag.M[1:x]))
cbind(M, diag.product) %>%
      as.data.frame()

我将DF转换为matrix。我取矩阵的diag(对角线值列表)。我使用diag计算sapply的产品,从5到1,4对1,3到1等。我将diag.product的cbind列绑定到矩阵并转换为data.frame

答案 1 :(得分:0)

对于 date Bucket 因子级别的每个组合,使用prod考虑滑动条件by乘法。为了验证,下面打印出每个乘法中的因子,但不需要print行内部函数:

数据

df <- read.table(text='date      Bucket             D   
                 "1/31/2013"   "bkt 0"             NA
                 "1/31/2013"   "bkt 1(10-20)"      NA
                 "1/31/2013"   "bkt 2(20-30)"      NA
                 "1/31/2013"   "bkt 3(30-40)"      NA
                 "1/31/2013"   "bkt 4(40+)"        NA
                 "2/28/2013"   "bkt 0"             NA
                 "2/28/2013"   "bkt 1(10-20)"      3.00
                 "2/28/2013"   "bkt 2(20-30)"      3.63
                 "2/28/2013"   "bkt 3(30-40)"      101
                 "2/28/2013"   "bkt 4(40+)"        102
                 "3/30/2013"   "bkt 0"             NA
                 "3/30/2013"   "bkt 1(10-20)"      0.55
                 "3/30/2013"   "bkt 2(20-30)"      0.40
                 "3/30/2013"   "bkt 3(30-40)"      103
                 "3/30/2013"   "bkt 4(40+)"        104
                 "4/31/2013"   "bkt 0"             NA
                 "4/31/2013"   "bkt 1(10-20)"      4.25              
                 "4/31/2013"   "bkt 2(20-30)"      3.65              
                 "4/31/2013"   "bkt 3(30-40)"      105        
                 "4/31/2013"   "bkt 4(40+)"        106        
                 "5/30/2013"   "bkt 0"             NA
                 "5/30/2013"   "bkt 1(10-20)"      2.34
                 "5/30/2013"   "bkt 2(20-30)"      4.10
                 "5/30/2013"   "bkt 3(30-40)"      107
                 "5/30/2013"   "bkt 4(40+)"        108
                 "6/30/2013"   "bkt 0"             NA
                 "6/30/2013"   "bkt 1(10-20)"      4.00
                 "6/30/2013"   "bkt 2(20-30)"      5.00
                 "6/30/2013"   "bkt 3(30-40)"      109.00
                 "6/30/2013"   "bkt 4(40+)"        110.00
                 "7/30/2013"   "bkt 0"             NA
                 "7/30/2013"   "bkt 1(10-20)"      8.00
                 "7/30/2013"   "bkt 2(20-30)"      7.00
                 "7/30/2013"   "bkt 3(30-40)"      111.00
                 "7/30/2013"   "bkt 4(40+)"        112.00', header=TRUE)

<强>代码

 # ADD LEVEL COLUMNS
df$datelvl <- as.integer(as.factor(df$date))
df$bucketlvl <- as.integer(as.factor(df$Bucket))

# RUN CONDITIONAL prod BY EACH LEVEL WITH by
df$DP <- as.numeric(by(df, df[,c("bucketlvl", "datelvl")], FUN=function(i) {

  if(i$datelvl[1] >= 5) {  # CONDITION FOR NUMBER OF BUCKET GROUP
      tmp <- as.numeric(df[(df$datelvl==i$datelvl[1] & df$bucketlvl==5) |
                           (df$datelvl==i$datelvl[1]-1 & df$bucketlvl==4) |
                           (df$datelvl==i$datelvl[1]-2 & df$bucketlvl==3) |
                           (df$datelvl==i$datelvl[1]-3 & df$bucketlvl==2) |
                           (df$datelvl==i$datelvl[1]-4 & df$bucketlvl==1) ,"D"])
  } else {
    tmp <- NA        
  } 

  print(tmp[i$bucketlvl[1]:length(tmp)])      
  prod(tmp[i$bucketlvl[1]:length(tmp)])

}))

打印输出

# [1] NA
# [1] NA NA
# [1] NA NA NA
# [1] NA NA NA NA
# [1] NA NA NA NA NA
# [1] NA
# [1] NA NA
# [1] NA NA NA
# [1] NA NA NA NA
# [1] NA NA NA NA NA
# [1] NA
# [1] NA NA
# [1] NA NA NA
# [1] NA NA NA NA
# [1] NA NA NA NA NA
# [1]     NA   3.63 103.00 106.00
# [1]   3.63 103.00 106.00
# [1] 103 106
# [1] 106
# [1]  NA 106
# [1]    NA   3.0   0.4 105.0 108.0
# [1]   3.0   0.4 105.0 108.0
# [1]   0.4 105.0 108.0
# [1] 105 108
# [1] 108
# [1]     NA   0.55   3.65 107.00 110.00
# [1]   0.55   3.65 107.00 110.00
# [1]   3.65 107.00 110.00
# [1] 107 110
# [1] 110
# [1]     NA   4.25   4.10 109.00 112.00
# [1]   4.25   4.10 109.00 112.00
# [1]   4.1 109.0 112.0
# [1] 109 112
# [1] 112

数据框输出

#         date       Bucket      D datelvl bucketlvl        DP
# 1  1/31/2013        bkt 0     NA       1         1        NA
# 2  1/31/2013 bkt 1(10-20)     NA       1         2        NA
# 3  1/31/2013 bkt 2(20-30)     NA       1         3        NA
# 4  1/31/2013 bkt 3(30-40)     NA       1         4        NA
# 5  1/31/2013   bkt 4(40+)     NA       1         5        NA
# 6  2/28/2013        bkt 0     NA       2         1        NA
# 7  2/28/2013 bkt 1(10-20)   3.00       2         2        NA
# 8  2/28/2013 bkt 2(20-30)   3.63       2         3        NA
# 9  2/28/2013 bkt 3(30-40) 101.00       2         4        NA
# 10 2/28/2013   bkt 4(40+) 102.00       2         5        NA
# 11 3/30/2013        bkt 0     NA       3         1        NA
# 12 3/30/2013 bkt 1(10-20)   0.55       3         2        NA
# 13 3/30/2013 bkt 2(20-30)   0.40       3         3        NA
# 14 3/30/2013 bkt 3(30-40) 103.00       3         4        NA
# 15 3/30/2013   bkt 4(40+) 104.00       3         5        NA
# 16 4/31/2013        bkt 0     NA       4         1        NA
# 17 4/31/2013 bkt 1(10-20)   4.25       4         2        NA
# 18 4/31/2013 bkt 2(20-30)   3.65       4         3        NA
# 19 4/31/2013 bkt 3(30-40) 105.00       4         4        NA
# 20 4/31/2013   bkt 4(40+) 106.00       4         5        NA
# 21 5/30/2013        bkt 0     NA       5         1        NA
# 22 5/30/2013 bkt 1(10-20)   2.34       5         2  13608.00
# 23 5/30/2013 bkt 2(20-30)   4.10       5         3   4536.00
# 24 5/30/2013 bkt 3(30-40) 107.00       5         4  11340.00
# 25 5/30/2013   bkt 4(40+) 108.00       5         5    108.00
# 26 6/30/2013        bkt 0     NA       6         1        NA
# 27 6/30/2013 bkt 1(10-20)   4.00       6         2  23628.28
# 28 6/30/2013 bkt 2(20-30)   5.00       6         3  42960.50
# 29 6/30/2013 bkt 3(30-40) 109.00       6         4  11770.00
# 30 6/30/2013   bkt 4(40+) 110.00       6         5    110.00
# 31 7/30/2013        bkt 0     NA       7         1        NA
# 32 7/30/2013 bkt 1(10-20)   8.00       7         2 212724.40
# 33 7/30/2013 bkt 2(20-30)   7.00       7         3  50052.80
# 34 7/30/2013 bkt 3(30-40) 111.00       7         4  12208.00
# 35 7/30/2013   bkt 4(40+) 112.00       7         5    112.00

答案 2 :(得分:0)

我提出的使用基础R的方法(数据与@Parfait相同):

# Split the group
l1 <- tapply(df$D, df$date, identity)
# Get the corresponding elements in the group and remove the first one (NA)
es <- mapply(`[`, l1, seq_along(l1))[-1]
# Then get cummulative product and put them into df
DP <- c(rep(NA, nrow(df) - length(es)),
        Reduce(`*`, es, right=TRUE, accumulate=TRUE))

df$DP <- DP
df

#         date       Bucket      D    DP
# 1  1/31/2013        bkt 0     NA    NA
# 2  1/31/2013 bkt 1(10-20)     NA    NA
# 3  1/31/2013 bkt 2(20-30)     NA    NA
# 4  1/31/2013 bkt 3(30-40)     NA    NA
# 5  1/31/2013   bkt 4(40+)     NA    NA
# 6  2/28/2013        bkt 0     NA    NA
# 7  2/28/2013 bkt 1(10-20)   3.00    NA
# 8  2/28/2013 bkt 2(20-30)   3.63    NA
# 9  2/28/2013 bkt 3(30-40) 101.00    NA
# 10 2/28/2013   bkt 4(40+) 102.00    NA
# 11 3/30/2013        bkt 0     NA    NA
# 12 3/30/2013 bkt 1(10-20)   0.55    NA
# 13 3/30/2013 bkt 2(20-30)   0.40    NA
# 14 3/30/2013 bkt 3(30-40) 103.00    NA
# 15 3/30/2013   bkt 4(40+) 104.00    NA
# 16 4/31/2013        bkt 0     NA    NA
# 17 4/31/2013 bkt 1(10-20)   4.25    NA
# 18 4/31/2013 bkt 2(20-30)   3.65    NA
# 19 4/31/2013 bkt 3(30-40) 105.00    NA
# 20 4/31/2013   bkt 4(40+) 106.00    NA
# 21 5/30/2013        bkt 0     NA    NA
# 22 5/30/2013 bkt 1(10-20)   2.34 13608
# 23 5/30/2013 bkt 2(20-30)   4.10  4536
# 24 5/30/2013 bkt 3(30-40) 107.00 11340
# 25 5/30/2013   bkt 4(40+) 108.00   108