按组分组的数据框 - 按范围分隔列的值

时间:2016-12-27 19:55:53

标签: r casting dplyr

我的数据框如下:

# Example data: 26 rows, one per parent/child pair; every row has Type "desktop".
parent <- letters  # "a" through "z"
child <- LETTERS   # "A" through "Z"
Type <- rep("desktop", times = 26L)
Size <- c(
  "MEDIUM", "MEDIUM", "LARGE", "LARGE",
  "SMALL", "MEDIUM", "LARGE", "SMALL",
  "MEDIUM", "SMALL", "LARGE", "LARGE",
  "SMALL", "SMALL", "LARGE", "LARGE",
  "MEDIUM", "SMALL", "SMALL", "MEDIUM",
  "LARGE", "MEDIUM", "SMALL", "MEDIUM",
  "LARGE", "MEDIUM"
)
Revenue <- c(
  22138.16, 18617.94, 12394.36, 10535.76,
  8901.41, 7320.17, 3821.40, 2811.50,
  2483.10, 2145.76, 2138.41, 2037.67,
  1950.52, 1837.93, 1737.68, 1554.61,
  1374.40, 1334.02, 1214.60, 1191.41,
  1189.56, 1174.55, 1162.80, 1131.29,
  1127.05, 1108.53
)
NumberofSales <- c(
  1954720, 5129937, 1086104, 970326,
  1608012, 746613, 333424, 236643,
  352294, 587541, 209218, 342455,
  192670, 340580, 275260, 248049,
  251790, 128845, 303515, 112218,
  149878, 226633, 194973, 103425,
  101819, 114570
)
Price <- c(
  11.325489, 3.629273, 11.411762, 10.857959,
  5.535661, 9.804504, 11.461083, 11.880766,
  7.048374, 3.652103, 10.220966, 5.950183,
  10.123631, 5.396471, 6.312868, 6.267350,
  5.458517, 10.353681, 4.001779, 10.616924,
  7.936855, 5.182608, 5.963908, 10.938264,
  11.069152, 9.675570
)
Opps <- c(
  5144351, 6038044, 2354341, 4578272,
  7197544, 474510, 1045528, 181471,
  1071631, 801038, 928563, 477870,
  590497, 849537, 410179, 432703,
  1983993, 330478, 939806, 191824,
  283107, 575004, 256846, 249530,
  142318, 2036363
)

# Column names are taken from the vector names above.
df <- data.frame(
  parent, child, Type, Size,
  Revenue, NumberofSales, Price, Opps
)

这就是它的样子:

df

   parent child    Type   Size  Revenue NumberofSales     Price    Opps
1       a     A desktop MEDIUM 22138.16       1954720 11.325489 5144351
2       b     B desktop MEDIUM 18617.94       5129937  3.629273 6038044
3       c     C desktop  LARGE 12394.36       1086104 11.411762 2354341
4       d     D desktop  LARGE 10535.76        970326 10.857959 4578272
5       e     E desktop  SMALL  8901.41       1608012  5.535661 7197544
6       f     F desktop MEDIUM  7320.17        746613  9.804504  474510
7       g     G desktop  LARGE  3821.40        333424 11.461083 1045528
8       h     H desktop  SMALL  2811.50        236643 11.880766  181471
9       i     I desktop MEDIUM  2483.10        352294  7.048374 1071631
10      j     J desktop  SMALL  2145.76        587541  3.652103  801038
11      k     K desktop  LARGE  2138.41        209218 10.220966  928563
12      l     L desktop  LARGE  2037.67        342455  5.950183  477870
13      m     M desktop  SMALL  1950.52        192670 10.123631  590497
14      n     N desktop  SMALL  1837.93        340580  5.396471  849537
15      o     O desktop  LARGE  1737.68        275260  6.312868  410179
16      p     P desktop  LARGE  1554.61        248049  6.267350  432703
17      q     Q desktop MEDIUM  1374.40        251790  5.458517 1983993
18      r     R desktop  SMALL  1334.02        128845 10.353681  330478
19      s     S desktop  SMALL  1214.60        303515  4.001779  939806
20      t     T desktop MEDIUM  1191.41        112218 10.616924  191824
21      u     U desktop  LARGE  1189.56        149878  7.936855  283107
22      v     V desktop MEDIUM  1174.55        226633  5.182608  575004
23      w     W desktop  SMALL  1162.80        194973  5.963908  256846
24      x     X desktop MEDIUM  1131.29        103425 10.938264  249530
25      y     Y desktop  LARGE  1127.05        101819 11.069152  142318
26      z     Z desktop MEDIUM  1108.53        114570  9.675570 2036363

我想创建一个数据框,显示按 Size 和 Type 划分的 Price 分布,以及每个 Price 区间对应的各项指标汇总。我希望最终的数据框如下所示。(我还没有对这些度量值进行聚合,因为按我目前的做法太耗时,所以下面示例中的数值都相同;最终答案中每一行都应有各自不同的值。)

       Type    Size     Price Range    SUM_Opps   SUM_NumberofSales  SUM_Revenue 
1   desktop   LARGE        $3-$3.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE        $4-$4.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE        $5-$5.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE        $6-$6.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE        $7-$7.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE        $8-$8.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE        $9-$9.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE      $10-$10.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE      $11-$11.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE      $12-$12.99   9,143,587           2,531,983    $8,453.93
1   desktop   LARGE     $13-Greater   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $3-$3.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $4-$4.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $5-$5.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $6-$6.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $7-$7.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $8-$8.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM        $9-$9.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM      $10-$10.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM      $11-$11.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM      $12-$12.99   9,143,587           2,531,983    $8,453.93
1   desktop  MEDIUM     $13-Greater   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $3-$3.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $4-$4.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $5-$5.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $6-$6.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $7-$7.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $8-$8.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL        $9-$9.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL      $10-$10.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL      $11-$11.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL      $12-$12.99   9,143,587           2,531,983    $8,453.93
1   desktop   SMALL     $13-Greater   9,143,587           2,531,983    $8,453.93

如何创建上表?上表显示了按 Type、Size 和 Price Range 分组的 Opps、NumberofSales 和 Revenue 的总和。

我知道如何使用 dplyr 做简单的聚合,难点在于如何划分价格区间。

任何帮助都会很棒,谢谢!

2 个答案:

答案 0 :(得分:2)

您可以使用 Hmisc::cut2() 生成价格区间,区间会成为一个因子(factor)的水平(levels):

library(Hmisc)
library(dplyr)

# Bin Price with cut points at 4, 5, ..., 13. cut2() yields left-closed
# intervals and labels them like "[ 5.00, 6.00)"; the lowest bin starts at the
# smallest observed price (3.63 for this data).
df$cut_Price <- cut2(df$Price, cuts = 4:13)

# Convert cut2() interval labels into the "$L-$L.99" / "$13-Greater" form.
#
# The label is rebuilt from the floor of each bin's LOWER bound. Rewriting the
# upper bound textually (".00" -> ".99") is wrong: it turns "[ 5.00, 6.00)"
# into "[ 5.00, 6.99)", which overlaps the next bin, and it misses two-digit
# bounds such as "[10.00,11.00)".
#
# bin: factor or character vector of cut2() labels.
# Returns a character vector the same length as `bin`; a label whose lower
# bound cannot be parsed becomes NA.
price_range_label <- function(bin) {
  lower_txt <- sub("^\\[\\s*([0-9.]+),.*$", "\\1", as.character(bin))
  lower <- floor(as.numeric(lower_txt))
  ifelse(
    lower >= 13,
    "$13-Greater",
    sprintf("$%.0f-$%.0f.99", lower, lower)
  )
}

df %>%
  group_by(cut_Price, Size, Type) %>%
  summarise_at(c("Opps", "NumberofSales", "Revenue"), "sum") %>%
  # Sort while cut_Price is still a factor so bins follow numeric order
  # (as text, "$10-..." would sort before "$3-...").
  arrange(Size, cut_Price) %>%
  ungroup() %>%
  mutate(cut_Price = price_range_label(cut_Price))

# Expected result:
#
# # A tibble: 16 × 6
#     cut_Price   Size    Type    Opps NumberofSales  Revenue
#         <chr> <fctr>  <fctr>   <dbl>         <dbl>    <dbl>
# 1    $5-$5.99  LARGE desktop  477870        342455  2037.67
# 2    $6-$6.99  LARGE desktop  842882        523309  3292.29
# 3    $7-$7.99  LARGE desktop  283107        149878  1189.56
# 4  $10-$10.99  LARGE desktop 5506835       1179544 12674.17
# 5  $11-$11.99  LARGE desktop 3542187       1521347 17342.81
# 6    $3-$3.99 MEDIUM desktop 6038044       5129937 18617.94
# 7    $5-$5.99 MEDIUM desktop 2558997        478423  2548.95
# 8    $7-$7.99 MEDIUM desktop 1071631        352294  2483.10
# 9    $9-$9.99 MEDIUM desktop 2510873        861183  8428.70
# 10 $10-$10.99 MEDIUM desktop  441354        215643  2322.70
# 11 $11-$11.99 MEDIUM desktop 5144351       1954720 22138.16
# 12   $3-$3.99  SMALL desktop  801038        587541  2145.76
# 13   $4-$4.99  SMALL desktop  939806        303515  1214.60
# 14   $5-$5.99  SMALL desktop 8303927       2143565 11902.14
# 15 $10-$10.99  SMALL desktop  920975        321515  3284.54
# 16 $11-$11.99  SMALL desktop  181471        236643  2811.50

如果想把分割间隔从 1 改为 0.5,可以这样做,因为传给 cuts = ... 的向量定义了“切割点”:

# Bin Price using cut points every 0.5 from 4 to 13.
df[["cut_Price"]] <- cut2(
  df[["Price"]],
  cuts = seq(from = 4, to = 13, by = 0.5)
)

答案 1 :(得分:0)

这会添加一个价格分箱(price_bin)列:

library(dplyr)

# Sum Opps, NumberofSales and Revenue per Type / Size / whole-dollar price bin.
df %>%
  # price_bin is the whole-dollar floor of Price; anything above 13 is folded
  # into the top bin, 13.
  mutate(price_bin = pmin(floor(Price), 13)) %>%
  group_by(Type, Size, price_bin) %>%
  summarise(
    sum_opps = sum(Opps),
    sum_sales = sum(NumberofSales),
    sum_revenue = sum(Revenue)
  ) %>%
  # summarise() only removes the last grouping level, so the result would
  # still be grouped by Type and Size; drop that so later verbs act on the
  # whole table.
  ungroup()

**更新**

这个方法不需要额外的库,返回的结果与被采纳的答案相同,所以不确定为什么会被投反对票。结果如下:
      Type   Size price_bin sum_opps sum_sales  sum_revenue
    <fctr> <fctr>     <dbl>    <dbl>     <dbl>        <dbl>
1  desktop  LARGE         5   477870    342455      2037.67
2  desktop  LARGE         6   842882    523309      3292.29
3  desktop  LARGE         7   283107    149878      1189.56
4  desktop  LARGE        10  5506835   1179544     12674.17
5  desktop  LARGE        11  3542187   1521347     17342.81
6  desktop MEDIUM         3  6038044   5129937     18617.94
7  desktop MEDIUM         5  2558997    478423      2548.95
8  desktop MEDIUM         7  1071631    352294      2483.10
9  desktop MEDIUM         9  2510873    861183      8428.70
10 desktop MEDIUM        10   441354    215643      2322.70
11 desktop MEDIUM        11  5144351   1954720     22138.16
12 desktop  SMALL         3   801038    587541      2145.76
13 desktop  SMALL         4   939806    303515      1214.60
14 desktop  SMALL         5  8303927   2143565     11902.14
15 desktop  SMALL        10   920975    321515      3284.54
16 desktop  SMALL        11   181471    236643      2811.50