在两个值之间展开组并计算唯一的出现次数

时间:2018-05-03 16:26:36

标签: r dplyr

我有这样一张数据表:

> data.frame(user = c("x", "y"), item = c("a", "a"), level = c(1, 1), level_max = c(2, 4))
  user item level level_max
1    x    a     1         2
2    y    a     1         4
对于每个 item,我想统计在 level(最小值)到 level_max 之间的每个级别上不同用户的数量。所以预期的输出是:

> data.frame(item = c("a", "a", "a", "a"), level = 1:4, count = c(2, 2, 1, 1))
  item level count
1    a     1     2
2    a     2     2
3    a     3     1
4    a     4     1

我的方法是“扩展”数据框,以便为每个用户,项目和级别获得一行:

# Expand every input row into one row per level in level..level_max, then
# stack the per-row pieces into a single data frame.
# apply() passes each row as a named vector (presumably character here,
# since the table mixes text and numeric columns), so the bounds are looked
# up by name and `:` turns them back into numbers.
exapanded_df <- do.call(
  rbind.data.frame,
  apply(x, 1L, function(row) {
    lvls <- row["level"]:row["level_max"]
    n_lvls <- length(lvls)
    data.frame(
      user = rep(row["user"], n_lvls),
      item = rep(row["item"], n_lvls),
      level = lvls
    )
  })
)

然后使用 dplyr 进行分组和计数:
library(dplyr)
# Number of distinct users observed at each (item, level) combination.
summarize(
  group_by(exapanded_df, item, level),
  count = n_distinct(user)
)

但我的数据行数很多,而 apply 方法效率不高,还有其他办法吗?谢谢!

2 个答案:

答案 0 :(得分:2)

我们可以扩展数据框并计算这样的数字,尽管它可能不是最有效的解决方案。

(此处原有的代码块在抓取时丢失,被一条无关的文件路径取代;对应的代码是下文以 `dat2 <- dat %>%` 开头的管道。)

对于基准测试,我们可以伪造更大的数据:

library(tidyverse)

# Expand each row into one row per level between `level` and `level_max`,
# then count the rows at every (item, level) pair.
# NOTE(review): count() tallies rows, which equals the number of distinct
# users only if each (user, item) pair occurs once in `dat` - confirm.
dat2 <- dat %>%
  # Replace `level` with a list-column holding the sequence level:level_max.
  mutate(level = map2(level, level_max, `:`)) %>%
  # Name the list-column to expand: a bare unnest() is deprecated and warns
  # in tidyr >= 1.0.0, while unnest(level) works in old and new releases.
  unnest(level) %>%
  count(item, level)
dat2
# # A tibble: 4 x 3
#   item  level     n
#   <fct> <int> <int>
# 1 a         1     2
# 2 a         2     2
# 3 a         3     1
# 4 a         4     1

与@ Frank的方法相比:

# Simulated benchmark input: 1,000 users x 26 items (one row per pair), each
# with a random starting level in 1..5 and a random level_max in 5..9.
# The seed is fixed so the draws are reproducible.
set.seed(1)
x <- data.frame(
  user = rep(seq_len(1e3), each = 26),
  item = rep(letters, times = 1e3),
  level = sample(1:5, size = 1e3 * 26, replace = TRUE),
  level_max = sample(5:9, size = 1e3 * 26, replace = TRUE)
)

# Time the two ways of expanding `x` to one row per (user, item, level) and
# counting per (item, level); each expression is run five times.
microbenchmark::microbenchmark(
  # a: row-wise apply() building one small data frame per input row, bound
  #    together with rbind, then grouped and counted with n_distinct().
  a = do.call(rbind.data.frame, apply(x, 1L, function(x) {
    range <- x["level"]:x["level_max"]
    data.frame(user = rep(x["user"], length(range)),
               item = rep(x["item"], length(range)),
               level = range)
  })) %>%
    group_by(item, level) %>%
    summarize(count = n_distinct(user)),
  # b: list-column expansion with map2() followed by unnest() and count().
  #    The list-column is named explicitly because a bare unnest() is
  #    deprecated and warns in tidyr >= 1.0.0.
  b = mutate(x, level = map2(level, level_max, seq)) %>%
    unnest(level) %>%
    count(item, level),
  times = 5
)


# Unit: milliseconds
#  expr        min         lq       mean     median         uq        max neval
#     a 33489.1391 36795.5105 36460.3686 37205.2517 37284.5728 37527.3690     5
#     b   407.6839   454.4582   461.4137   464.6317   480.3397   499.9549     5

答案 1 :(得分:2)

# Build the grid of (item, level) pairs that need a count: for each item,
# every level from its smallest `level` up to its largest `level_max`.
library(purrr)
library(tidyr)
out <- x %>%
  group_by(item) %>%
  summarise(level = map2(min(level), max(level_max), seq)) %>%
  unnest(level)

# Attach `n` to every (item, level) row of `out`: the number of rows of `x`
# matched by count_matches() under the three join conditions below.
mutate(
  out,
  n = count_matches(
    out, x,
    item == item, level <= level, level_max >= level
  )
)

  item level n
1    a     1 2
2    a     2 2
3    a     3 1
4    a     4 1

其中count_matches是辅助函数:

library(data.table)
# Count, for each row of DF, how many rows of targetDF satisfy the join
# conditions given in `...`.
#
# DF        table whose rows are looked up; it is used as the `i` table of
#           the data.table join, grouped with by = .EACHI.
# targetDF  table that is searched for matching rows.
# ...       join conditions, captured unevaluated and handed to data.table's
#           `on` argument, e.g. `item == item, level <= level`.
#
# Returns the `N` column of the join result.
count_matches <- function(DF, targetDF, ...) {
  join_spec <- substitute(list(...))
  target_dt <- data.table(targetDF)
  lookup_dt <- data.table(DF)
  # NOTE(review): relies on data.table resolving `on = eval(<symbol>)` to the
  # captured list(...) call - confirm before restructuring this argument.
  hits <- target_dt[
    lookup_dt,
    .N,
    on = eval(join_spec),
    by = .EACHI,
    allow.cartesian = TRUE
  ]
  hits$N
}

如果您不想使用 purrr 和 tidyr,这一部分也可以用较“老式”的写法:

# Same (item, level) grid without purrr/tidyr: do() is evaluated per item
# group and yields a data frame spanning min(level):max(level_max).
out <- x %>%
  group_by(item) %>%
  do(data.frame(level = min(.$level):max(.$level_max)))

(披露:这个辅助函数是我昨天发布过的,这里重新贴出……)