我有这样一张表(数据框):
> data.frame(user = c("x", "y"), item = c("a", "a"), level = c(1, 1), level_max = c(2, 4))
user item level level_max
1 x a 1 2
2 y a 1 4
对于每个项目(item),我想统计在 level(最小值)与 level_max 之间的每个级别上不同用户的数量。所以预期的输出是:
> data.frame(item = c("a", "a", "a", "a"), level = 1:4, count = c(2, 2, 1, 1))
item level count
1 a 1 2
2 a 2 2
3 a 3 1
4 a 4 1
我的方法是“扩展”数据框,以便为每个用户,项目和级别获得一行:
# Expand `x` to one row per (user, item, level), where level runs from
# `level` to `level_max` for each input row.
#
# This replaces apply() + do.call(rbind, ...): apply() coerces each row of a
# data frame to character, and binding one small data frame per input row is
# slow on large inputs. Here the per-row ranges are built once and the id
# columns are repeated to match, so `user` and `item` keep their original
# column types.
level_ranges <- Map(`:`, x$level, x$level_max)
n_levels <- lengths(level_ranges)
exapanded_df <- data.frame(
  user = rep(x$user, n_levels),
  item = rep(x$item, n_levels),
  level = unlist(level_ranges)
)
然后使用dplyr
library(dplyr)
# Number of distinct users for every (item, level) pair of the expanded data.
summarize(
  group_by(exapanded_df, item, level),
  count = n_distinct(user)
)
但我的数据有很多行,而 apply 方法效率不高,还有其他选择吗?谢谢。
答案 0 :(得分:2)
我们可以扩展数据框并计算这样的数字,尽管它可能不是最有效的解决方案。
app/design/frontend/[themeparent]/[themename]/Magento_Checkout/templates/success.phtml
对于基准测试,我们可以伪造更大的数据:
library(tidyverse)
# Turn `level` into a list-column holding level:level_max for each row,
# expand it to one row per level, then count rows per (item, level).
# NOTE: count() tallies rows, so `n` equals the number of distinct users only
# when no user has overlapping level ranges for the same item.
dat2 <- dat %>%
  mutate(level = map2(level, level_max, `:`)) %>%
  # Name the list-column explicitly: unnest() with no columns is deprecated
  # in tidyr >= 1.0.0 and emits a warning.
  unnest(level) %>%
  count(item, level)
dat2
# # A tibble: 4 x 3
# item level n
# <fct> <int> <int>
# 1 a 1 2
# 2 a 2 2
# 3 a 3 1
# 4 a 4 1
与@ Frank的方法相比:
# Reproducible synthetic benchmark data: 1000 users x 26 items (26000 rows),
# with a random `level` in 1..5 and a random `level_max` in 5..9 per row.
set.seed(1)
x <- data.frame(
  user = rep(1:1000, each = 26),
  item = rep(letters, times = 1000),
  level = sample(1:5, size = 26000, replace = TRUE),
  level_max = sample(5:9, size = 26000, replace = TRUE)
)
# Time the two expansion strategies on the synthetic data `x`.
microbenchmark::microbenchmark(
# a: the question's approach -- build one small data frame per row with
# apply(), rbind them all, then count distinct users per (item, level).
# Inside the anonymous function, the parameter `x` is a single row and
# shadows the outer data frame `x`.
a = do.call(rbind.data.frame, apply(x, 1L, function(x) {
range <- x["level"]:x["level_max"]
data.frame(user = rep(x["user"], length(range)),
item = rep(x["item"], length(range)),
level = range)
})) %>%
group_by(item, level) %>%
summarize(count = n_distinct(user)),
# b: the list-column approach -- replace `level` by seq(level, level_max),
# unnest to one row per level, then count rows per (item, level).
b = mutate(x, level = map2(level, level_max, seq)) %>%
unnest() %>%
count(item, level),
# Each expression is evaluated 5 times.
times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# a 33489.1391 36795.5105 36460.3686 37205.2517 37284.5728 37527.3690 5
# b 407.6839 454.4582 461.4137 464.6317 480.3397 499.9549 5
答案 1 :(得分:2)
# enumerate where we want counts
library(purrr)
library(tidyr)
# For every item, list each level from the smallest `level` to the largest
# `level_max`; these are the (item, level) pairs that need a count.
out <- x %>%
  group_by(item) %>%
  summarise(level = list(seq(min(level), max(level_max)))) %>%
  unnest(level)
# count based on conditions
# For each (item, level) row of `out`, count the rows of `x` with the same
# item whose [level, level_max] range contains that level.
mutate(
  out,
  n = count_matches(out, x, item == item, level <= level, level_max >= level)
)
item level n
1 a 1 2
2 a 2 2
3 a 3 1
4 a 4 1
其中 count_matches 是一个辅助函数:
library(data.table)
# Count, for each row of `DF`, how many rows of `targetDF` satisfy the join
# conditions given in `...`.
#
# DF        data frame whose rows are looked up (one count per row).
# targetDF  data frame whose rows are counted.
# ...       unevaluated join conditions such as `item == item` or
#           `level <= level`; the left-hand side names a column of
#           `targetDF`, the right-hand side a column of `DF`.
#
# Returns the vector of match counts (`.N` per row of `DF`).
count_matches <- function(DF, targetDF, ...) {
  # Capture the conditions unevaluated so data.table can read them as `on`.
  join_conds <- substitute(list(...))
  target_dt <- data.table(targetDF)
  lookup_dt <- data.table(DF)
  matches <- target_dt[
    lookup_dt,
    on = eval(join_conds),
    allow.cart = TRUE,
    .N,
    by = .EACHI
  ]
  matches$N
}
如果您不想使用 purrr 和 tidyr,那么这一部分较为“传统”的写法是:
# Same enumeration without purrr/tidyr: for every item, build a data frame
# of all levels from the smallest `level` to the largest `level_max`.
out <- x %>%
  group_by(item) %>%
  do(data.frame(level = min(.$level):max(.$level_max)))