每个分类变量组合的平均价格 - R.

时间:2017-08-24 08:34:56

标签: r dtplyr

我正在使用钻石数据集。

> dput(diamonds_2[1:100,])
structure(list(carat = structure(c(4L, 2L, 4L, 10L, 12L, 5L, 
5L, 7L, 3L, 4L, 11L, 4L, 3L, 12L, 1L, 13L, 11L, 11L, 11L, 11L, 
11L, 4L, 4L, 12L, 12L, 4L, 5L, 11L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 12L, 7L, 14L, 14L, 14L, 7L, 7L, 13L, 10L, 13L, 13L, 6L, 
10L, 5L, 4L, 13L, 3L, 3L, 11L, 11L, 11L, 11L, 11L, 16L, 11L, 
11L, 11L, 23L, 9L, 13L, 12L, 12L, 5L, 5L, 11L, 11L, 11L, 11L, 
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 19L, 7L, 5L, 5L, 5L, 5L, 13L, 
45L, 61L, 45L, 46L, 53L, 45L, 45L, 71L, 48L, 55L), .Label = c("0.2", 
"0.21", "0.22", "0.23", "0.24", "0.25", "0.26", "0.27", "0.28", 
"0.29", "0.3", "0.31", "0.32", "0.33", "0.34", "0.35", "0.36", 
"0.37", "0.38", "0.39", "0.4", "0.41", "0.42", "0.43", "0.5", 
"0.51", "0.52", "0.53", "0.54", "0.55", "0.56", "0.57", "0.58", 
"0.59", "0.6", "0.61", "0.62", "0.63", "0.64", "0.65", "0.66", 
"0.67", "0.68", "0.69", "0.7", "0.71", "0.72", "0.73", "0.74", 
"0.75", "0.76", "0.77", "0.78", "0.79", "0.8", "0.81", "0.82", 
"0.83", "0.84", "0.85", "0.86", "0.87", "0.88", "0.89", "0.9", 
"0.91", "0.92", "0.93", "0.94", "0.95", "0.96", "0.97", "0.98", 
"0.99", "1", "1.01", "1.02", "1.03", "1.04", "1.05", "1.06", 
"1.07", "1.08", "1.09", "1.1", "1.11", "1.12", "1.13", "1.14", 
"1.15", "1.16", "1.17", "1.18", "1.19", "1.2", "1.21", "1.22", 
"1.23", "1.24", "1.25", "1.27", "1.28", "1.29", "1.31", "1.5", 
"1.51", "1.52"), class = "factor"), color = structure(c(2L, 2L, 
2L, 6L, 7L, 7L, 6L, 5L, 2L, 5L, 7L, 7L, 3L, 7L, 2L, 2L, 6L, 7L, 
7L, 7L, 6L, 2L, 5L, 7L, 7L, 4L, 6L, 7L, 1L, 3L, 3L, 3L, 2L, 2L, 
1L, 3L, 2L, 5L, 1L, 6L, 6L, 7L, 1L, 1L, 5L, 3L, 5L, 5L, 2L, 5L, 
3L, 4L, 6L, 2L, 1L, 6L, 7L, 6L, 6L, 6L, 6L, 1L, 1L, 1L, 6L, 4L, 
6L, 4L, 4L, 2L, 1L, 5L, 5L, 5L, 5L, 3L, 2L, 1L, 1L, 2L, 2L, 1L, 
2L, 6L, 2L, 4L, 5L, 5L, 5L, 6L, 2L, 2L, 4L, 2L, 4L, 2L, 3L, 3L, 
2L, 5L), .Label = c("1", "2", "3", "4", "5", "6", "7"), class = "factor"), 
    clarity = structure(c(2L, 3L, 5L, 4L, 2L, 6L, 7L, 3L, 4L, 
    5L, 3L, 5L, 3L, 2L, 2L, 1L, 2L, 3L, 3L, 3L, 2L, 4L, 5L, 3L, 
    3L, 6L, 5L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 4L, 
    2L, 2L, 3L, 4L, 5L, 2L, 3L, 2L, 2L, 4L, 2L, 3L, 5L, 3L, 4L, 
    4L, 2L, 2L, 3L, 3L, 3L, 5L, 3L, 3L, 3L, 2L, 6L, 7L, 3L, 3L, 
    7L, 7L, 3L, 3L, 3L, 3L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 6L, 2L, 
    7L, 7L, 7L, 7L, 6L, 3L, 3L, 2L, 4L, 4L, 2L, 4L, 5L, 2L, 3L, 
    3L), .Label = c("1", "2", "3", "4", "5", "6", "7", "8"), class = "factor"), 
    price = c(481, 481, 492, 558, 568, 579, 579, 590, 590, 601, 
    610, 621, 642, 660, 671, 671, 700, 729, 729, 729, 729, 740, 
    750, 750, 750, 761, 772, 793, 793, 793, 951, 951, 951, 951, 
    951, 951, 951, 951, 952, 952, 952, 952, 952, 952, 952, 952, 
    952, 952, 953, 953, 953, 953, 953, 953, 953, 954, 954, 954, 
    954, 954, 958, 958, 958, 958, 958, 959, 959, 959, 959, 959, 
    959, 960, 960, 960, 960, 960, 960, 960, 960, 960, 960, 960, 
    960, 960, 960, 960, 960, 960, 960, 960, 1, 1, 1, 2, 2, 2, 
    2, 2, 3, 3), cut_new = structure(c(1L, 1L, 2L, 1L, 2L, 3L, 
    3L, 3L, 2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 3L, 2L, 
    3L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 
    2L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 3L, 2L, 3L, 3L, 3L, 
    1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 3L, 1L, 1L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 
    3L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 3L, 2L, 
    2L, 2L, 3L, 1L), .Label = c("Above average", "Below average", 
    "Very Good"), class = "factor")), .Names = c("carat", "color", 
"clarity", "price", "cut_new"), row.names = c(NA, 100L), class = "data.frame")

现在我想计算平均价格,使得每颗钻石除了显示自身价格之外,旁边还显示其所属组合(color 与 cut_new)的平均价格。

enter image description here

我已尝试过以下代码,但都无法得到正确的结果:

尝试1:

# Attempt 1: compute group means with tapply(), then try to merge them back
# onto the original rows.
head(diamonds_2)
# Coerce to a plain data.frame; a copy is also stored in `x`, which is not
# used again in this snippet.
diamonds_2 <- x <- as.data.frame(diamonds_2)
diamonds_2$price <- as.numeric(diamonds_2$price)
# With two grouping factors, tapply() returns a color-by-cut_new array of
# means rather than a data frame.
# NOTE(review): the result is stored under the name `mean`, the same name as
# the base function being passed as FUN -- consider a different name.
mean <- tapply(diamonds_2$price, list(diamonds_2$color, diamonds_2$cut_new), mean, na.rm = T)
# NOTE(review): by.x = "cut_new" is paired with by.y = "color", i.e. two
# different variables, and the tapply() result has no column named "color";
# this is presumably why the merge does not produce the desired output.
combine <- merge (diamonds_2, mean, by.x = "cut_new", by.y= "color")

尝试2:

# Attempt 2: mean price summarised with summaryBy().
# NOTE(review): summaryBy() is presumably doBy::summaryBy -- no library() call
# is shown. The formula groups by `color` only (cut_new is not included), and
# the result is assigned to `results`, not joined back onto diamonds_2.
results <- summaryBy(price~color, data= diamonds_2, FUN = mean)

有什么办法能让其中一种方法正常工作吗?

谢谢

1 个答案:

答案 0 :(得分:-1)

假设您的输入数据集为 df,您可以使用以下代码段,根据 color 和 cut_new 变量获取每个钻石的平均价格:

library(dplyr)

# Per-group summary: one row per (color, cut_new) combination with the mean
# price of that group.
df %>% group_by(color, cut_new) %>% 
  summarise(AvgPrice= mean(price))

# # A tibble: 20 x 3
# color       cut_new     AvgPrice
# <fctr>        <fctr>      <dbl>
#  1      1 Above average 956.7500
#  2      1 Below average 952.0000
#  3      1     Very Good 933.5714
#  4      2 Above average 647.1250
#  5      2 Below average 499.3333
#  6      2     Very Good 720.0000
#  7      3 Above average 797.0000
#  8      3 Below average 318.3333
#  9      3     Very Good 921.6000
# 10      4 Above average 766.4000
# 11      4     Very Good 574.0000
# 12      5 Above average 800.5000
# 13      5 Below average 953.7500
# 14      5     Very Good 801.0000
# 15      6 Above average 886.3333
# 16      6 Below average 841.5000
# 17      6     Very Good 829.0000
# 18      7 Above average 796.7500
# 19      7 Below average 659.0000
# 20      7     Very Good 720.2000

# The question asks for the group average shown next to each diamond's own
# price. summarise() collapses the data to one row per group, so use mutate()
# on the grouped data instead: it keeps every original row and adds AvgPrice
# as a new column holding the mean price of that row's (color, cut_new) group.
# ungroup() drops the grouping afterwards so later operations are not
# silently performed per group.
df %>%
  group_by(color, cut_new) %>%
  mutate(AvgPrice = mean(price)) %>%
  ungroup()