代表:
# Reprex
library(dplyr)
library(weights)

# Fix the RNG seed so the random weights, and therefore every weighted
# percentage below, are identical from run to run.
set.seed(1)
df <- data.frame(
  var1 = c(1, 1, 1, 2, 1, 2, 1, 2, 2, 1),
  var2 = c(1, 2, 2, 3, 3, 3, 2, 1, 2, 2),
  var3 = c("A", "B", "A", "A", "A", "B", "A", "B", "A", "A"),
  weight = rnorm(10)
)

# sub1: rows with var1 == 1 and var2 == 3
sub <- filter(df, var1 == 1 & var2 == 3)
round(weights::wpct(sub$var3, weight = sub$weight), digits = 2)

# sub2: rows with var1 == 2
sub <- filter(df, var1 == 2)
round(weights::wpct(sub$var3, weight = sub$weight), digits = 2)

# sub3: rows with var2 == 2
sub <- filter(df, var2 == 2)
round(weights::wpct(sub$var3, weight = sub$weight), digits = 2)

# Looking for more efficient way to continue subgroups (with more vars and combinations)
答案 0 :(得分:1)
使用 `data.table` 的 `cube` 函数,这非常简单。该函数可用于对多个变量内的所有分组以及整体分组计算某个函数。但是,在 `data.table` 中我们会遇到一个小问题:它期望每个分组仅输出一个值,而函数 `wpct` 会为 `x`(在我们的例子中是 `var3`)中的每个组各返回一个值。幸运的是,该函数会为其输出命名,因此将结果封装为 `list`,即 `result = list(weights::wpct(var3, weight))`,就能让我们之后把它转换为可读的格式。
分组显示在 `var1` 和 `var2` 两列中;这还会计算所有整体分组(在输出中以 `NA` 表示“任意值”)。
set.seed(1)
library(data.table)
library(weights)
# Build the example data and turn it into a data.table so cube() can be used.
df <- as.data.table(
  data.frame(
    var1 = c(1, 1, 1, 2, 1, 2, 1, 2, 2, 1),
    var2 = c(1, 2, 2, 3, 3, 3, 2, 1, 2, 2),
    var3 = c("A", "B", "A", "A", "A", "B", "A", "B", "A", "A"),
    weight = rnorm(10)
  )
)

# wpct() returns one value per level of var3, so each group's result is
# wrapped in list() to keep it in a single list column named "result".
res <- cube(
  x = df,
  j = c(list(result = list(weights::wpct(var3, weight)))),
  by = c("var1", "var2")
)
res
## Output
var1 var2 result
1: 1 1 1
2: 1 2 1.3907765,-0.3907765
3: 2 3 2.058925,-1.058925
4: 1 3 1
5: 2 1 1
6: 2 2 1
7: 1 NA 1.2394648,-0.2394648
8: 2 NA 1.03932354,-0.03932354
9: NA 1 -5.599793, 6.599793
10: NA 2 -0.7351568, 1.7351568
11: NA 3 1.7429624,-0.7429624
12: NA NA 0.92322427,0.07677573
上面输出中分组列为 `NA` 的行即整体分组,例如 `var1 = 1` 且 `var2 = *any*`(第 7 行),以及 `var1, var2 = *any*`(第 12 行)。
但是,正如我在上面指出的那样,这个结果几乎是不可读的。不过,我们可以使用 `tidyr` 中的 `unnest_wider` 将 `result` 列展开为更好的格式,从而简单地解决此问题。
这样我们就得到了一种可读的格式,其中前两列指示分组,其余列给出变量 `var3` 的每个值的结果:
library(dplyr)
library(tidyr)
# Spread the named vectors stored in the `result` list column into one
# column per name.
unnest_wider(res, result)
# A tibble: 12 x 4
var1 var2 A B
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 NA
2 1 2 1.39 -0.391
3 2 3 2.06 -1.06
4 1 3 1 NA
5 2 1 NA 1
6 2 2 1 NA
7 1 NA 1.24 -0.239
8 2 NA 1.04 -0.0393
9 NA 1 -5.60 6.60
10 NA 2 -0.735 1.74
11 NA 3 1.74 -0.743
12 NA NA 0.923 0.0768
上表中除分组列之外的各列给出 `var3` 的每个值的结果。请注意,如果在 `var1` + `var2` 的特定组中没有 `var3` 的某个值,则返回 `NA`。
答案 1 :(得分:0)
您可能需要类似这样的函数:
#' Weighted percentages of `var3` for subgroups defined by two variables
#'
#' Builds a grid of the unique values of the filter variables. For every grid
#' row the data are subset three ways: by the first variable only, by the
#' second variable only, and by both together. The weighted percentages of
#' `var3` are then computed for each subset with `weights::wpct()` and
#' rounded to two digits.
#'
#' @param data A data frame containing the filter variables plus the columns
#'   `var3` and `weight`.
#' @param ... Unquoted filter variables. Only the first two are used for
#'   filtering; any further ones only enlarge the value grid.
#' @return An unnamed list of rounded `weights::wpct()` results, ordered as
#'   first-variable subsets, second-variable subsets, then combined subsets.
#'   Each part has one entry per grid row, so single-variable subsets repeat.
multi_filter <- function(data, ...) {
  filter_vars <- rlang::enquos(...)
  if (length(filter_vars) < 2L) {
    stop("multi_filter() needs at least two filter variables in `...`.",
         call. = FALSE)
  }
  first_var <- filter_vars[[1]]
  second_var <- filter_vars[[2]]

  # Unique values of each filter variable, evaluated inside `data`.
  # (Replaces a multi-row summarise(), deprecated since dplyr 1.1.0.)
  all_vals <- lapply(filter_vars, function(x) {
    unique(rlang::eval_tidy(x, data = data))
  })
  val_df <- expand.grid(all_vals)

  # seq_len() gives an empty index for a zero-row grid; seq() gave c(1, 0).
  rows <- seq_len(nrow(val_df))

  # dplyr::filter is namespace-qualified so that stats::filter is never
  # picked up when dplyr is not attached.
  only_first <- lapply(rows, function(i) {
    dplyr::filter(data, (!!first_var) == val_df[i, 1])
  })
  only_second <- lapply(rows, function(i) {
    dplyr::filter(data, (!!second_var) == val_df[i, 2])
  })
  both <- lapply(rows, function(i) {
    dplyr::filter(
      data,
      (!!first_var) == val_df[i, 1] & (!!second_var) == val_df[i, 2]
    )
  })

  lapply(c(only_first, only_second, both), function(sub) {
    round(weights::wpct(sub$var3, weight = sub$weight), digits = 2)
  })
}
它会为每种组合生成结果,如下所示:
# Apply multi_filter() to the example data with var1 and var2 as the filter
# variables; the printed list follows.
multi_filter(df, var1, var2)
#> [[1]]
#> A B
#> 0.47 0.53
#>
#> [[2]]
#> A B
#> 0.35 0.65
#>
#> [[3]]
#> A B
#> 0.47 0.53
#>
#> [[4]]
#> A B
#> 0.35 0.65
#>
#> [[5]]
#> A B
#> 0.47 0.53
#>
#> [[6]]
#> A B
#> 0.35 0.65
#>
#> [[7]]
#> A B
#> 1.16 -0.16
#>
#> [[8]]
#> A B
#> 1.16 -0.16
#>
#> [[9]]
#> A B
#> 0.38 0.62
#>
#> [[10]]
#> A B
#> 0.38 0.62
#>
#> [[11]]
#> A B
#> 0.73 0.27
#>
#> [[12]]
#> A B
#> 0.73 0.27
#>
#> [[13]]
#> A
#> 1
#>
#> [[14]]
#> B
#> 1
#>
#> [[15]]
#> A B
#> 0.39 0.61
#>
#> [[16]]
#> A
#> 1
#>
#> [[17]]
#> A
#> 1
#>
#> [[18]]
#> A B
#> 0.4 0.6