我有一个数据框,我已经进行了子集化。
以下是glimpse(oc_LV)
$ city <chr> "Las Vegas", "Las Vegas", "Las Vegas", "Las Vegas", "Las Vegas", "Las Ve...
$ is_open <chr> "1", "0", "1", "1", "0", "0"
$ categories <chr> "Chinese", "Chinese", "Thai", "Japanese", "Japanese", "Thai"
$ n <int> 196, 86, 54, 51, 38, 34
这是dput(oc_LV)
structure(list(city = c("Las Vegas", "Las Vegas", "Las Vegas",
"Las Vegas", "Las Vegas", "Las Vegas"), is_open = c("1", "0",
"1", "1", "0", "0"), categories = c("Chinese", "Chinese", "Thai",
"Japanese", "Japanese", "Thai"), n = c(196L, 86L, 54L, 51L, 38L,
34L)), row.names = c(NA, -6L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = c("city", "is_open"), drop = TRUE, .Names = c("city",
"is_open", "categories", "n"), indices = list(c(1L, 4L, 5L),
c(0L, 2L, 3L)), group_sizes = c(3L, 3L), biggest_group_size = 3L, labels = structure(list(
city = c("Las Vegas", "Las Vegas"), is_open = c("0", "1")), row.names = c(NA,
-2L), class = "data.frame", vars = c("city", "is_open"), drop = TRUE, .Names = c("city",
"is_open")))
我想针对变量 is_open 制作一个包含频率、平均值和加权平均值的表格(table())。
我是这样做的
# Per-cuisine counts of open (is_open == "1") and closed (is_open == "0")
# restaurants, copied from the n column of oc_LV.
cuisine <- c("Chinese", "Thai", "Japanese")
open_fr <- c(196, 54, 51)
closed_fr <- c(86, 34, 38)

# Derive every total from the count vectors instead of re-typing the numbers.
# The hand-typed version used 36 instead of 34 for the Thai closed count and
# 90 / (282 + 90 + 89) for the Thai closed share, so the Thai values computed
# here differ slightly from the earlier hand-typed results
# (open_avg 0.61, closed_avg 0.39, closed_wavg 0.07).
cuisine_total <- open_fr + closed_fr
grand_total <- sum(cuisine_total)

# Share of open / closed restaurants within each cuisine.
open_avg <- open_fr / cuisine_total
closed_avg <- closed_fr / cuisine_total

# Within-cuisine share weighted by the cuisine's share of all restaurants.
cuisine_weight <- cuisine_total / grand_total
open_wavg <- cuisine_weight * open_avg
closed_wavg <- cuisine_weight * closed_avg

open_closed_LV <- data.frame(cuisine, open_fr, closed_fr, open_avg, closed_avg, open_wavg, closed_wavg)

# Round only the derived columns of the data frame; the standalone vectors
# above keep full precision.
ratio_cols <- c("open_avg", "closed_avg", "open_wavg", "closed_wavg")
open_closed_LV[ratio_cols] <- lapply(open_closed_LV[ratio_cols], round, digits = 2)
结果很好,但它很复杂。
Variables: 7
$ cuisine <fctr> Chinese, Thai, Japanese
$ open_fr <dbl> 196, 54, 51
$ closed_fr <dbl> 86, 34, 38
$ open_avg <dbl> 0.70, 0.60, 0.57
$ closed_avg <dbl> 0.30, 0.20, 0.43
$ open_wavg <dbl> 0.43, 0.12, 0.11
$ closed_wavg <dbl> 0.19, 0.08, 0.08
当我尝试通过这样的函数计算数字时:
# NOTE: this call fails as written. select() returns a data frame (and, because
# oc_LV is grouped by city and is_open, it keeps those grouping columns too),
# whereas mean() needs a numeric or logical vector. Extract the column instead,
# e.g. mean(oc_LV$n) or oc_LV %>% pull(n) %>% mean().
oc_LV %>% select(n) %>% mean()
我收到这样的错误:
Warning message: In mean.default(.) : argument is not numeric or logical: returning NA
是否有更聪明的方法来得到与上述做法相同的结果?在本例中,由于类别很少,手工计算不成问题;但如果类别数量增加,这种方法就行不通了。
答案 0 :(得分:1)
你的意思是这样吗?
library(dplyr)
library(tidyr)
# Build one row per city / cuisine with open and closed counts side by side,
# then derive the within-cuisine shares and their weighted versions.
df <- oc_LV %>%
  # One column per is_open level ("0" = closed, "1" = open); a cuisine that
  # lacks one of the two statuses gets a count of 0 instead of NA.
  spread(is_open, n, fill = 0L) %>%
  # Rename by name rather than by position so the result does not depend on
  # the column order produced by spread().
  rename(cuisine = categories, closed_fr = `0`, open_fr = `1`) %>%
  mutate(
    # Share of open / closed restaurants within each cuisine.
    open_avg = open_fr / (open_fr + closed_fr),
    closed_avg = closed_fr / (open_fr + closed_fr),
    # Weight each share by the cuisine's share of all restaurants. The data is
    # still grouped by city here, so sum() is taken within each city.
    open_wavg = (open_fr + closed_fr) / sum(open_fr + closed_fr) * open_avg,
    closed_wavg = (open_fr + closed_fr) / sum(open_fr + closed_fr) * closed_avg
  )
df
输出是:
# A tibble: 3 x 8
# Groups: city [1]
city cuisine closed_fr open_fr open_avg closed_avg open_wavg closed_wavg
<chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 Las Vegas Chinese 86 196 0.70 0.30 0.43 0.187
2 Las Vegas Japanese 38 51 0.57 0.43 0.11 0.083
3 Las Vegas Thai 34 54 0.61 0.39 0.12 0.074
#sample data
> dput(oc_LV)
structure(list(city = c("Las Vegas", "Las Vegas", "Las Vegas",
"Las Vegas", "Las Vegas", "Las Vegas"), is_open = c("1", "0",
"1", "1", "0", "0"), categories = c("Chinese", "Chinese", "Thai",
"Japanese", "Japanese", "Thai"), n = c(196L, 86L, 54L, 51L, 38L,
34L)), .Names = c("city", "is_open", "categories", "n"), row.names = c(NA,
-6L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars = c("city",
"is_open"), drop = TRUE, indices = list(c(1L, 4L, 5L), c(0L,
2L, 3L)), group_sizes = c(3L, 3L), biggest_group_size = 3L, labels = structure(list(
city = c("Las Vegas", "Las Vegas"), is_open = c("0", "1")), .Names = c("city",
"is_open"), row.names = c(NA, -2L), class = "data.frame", vars = c("city",
"is_open"), drop = TRUE))
答案 1 :(得分:1)
我认为你想要的结果可以通过 dplyr 和 tidyr 的组合来实现。例如:
# Something like this: total n per city / cuisine / open status, then move the
# two is_open levels ("0" and "1") into columns of their own.
with_cols <- df %>%
  group_by(city, categories, is_open) %>%
  summarise(n = sum(n)) %>%
  spread(is_open, n)
> with_cols
# A tibble: 3 x 4
# Groups: city, categories [3]
city categories `0` `1`
* <chr> <chr> <int> <int>
1 Las Vegas Chinese 86 196
2 Las Vegas Japanese 38 51
3 Las Vegas Thai 34 54
我们可以重命名列以使其更漂亮
# Give the spread columns readable names: `0` becomes closed, `1` becomes open.
with_cols <- rename(with_cols, closed = `0`, open = `1`)
从这里可以使用简单的命令完成数据操作:
# Add the share of open restaurants for each row of with_cols.
mutate(with_cols, open_avg = open / (closed + open))