Question

假设有一个嵌套列表（列表的列表），例如用来表示一段时间内的各个市场。我们有六个时期，每个时期都有三个市场。每个市场可能包含很多变量，这里我们只关心 profit 和 sales。我们要计算每个指标在所有时期、所有市场上的总体平均值。下面是一个玩具数据集：

# Build a toy dataset: 6 periods, each holding 3 markets, where every market
# is a list with one random `profit` (1-100) and one random `sales` (1-10).
set.seed(11)  # fixed seed so the draws are reproducible

n_periods <- 6L
n_markets <- 3L

# Preallocate both containers instead of growing empty lists.
periods <- vector("list", n_periods)
markets <- vector("list", n_markets)

for (i in seq_len(n_periods)) {
  for (j in seq_len(n_markets)) {
    # `profit` is drawn before `sales`, keeping the random stream in the same
    # order for every market.
    markets[[j]] <- list(
      profit = sample(1:100, 1),
      sales = sample(1:10, 1)
    )
  }
  # `markets` is overwritten on the next pass; after the loop it still holds
  # the markets of the last period.
  periods[[i]] <- markets
}

这里有一些实际数据：

# Sample data as a nested list literal (looks like dput() output): 6 periods,
# each a list of 3 markets, each market a list with integer `profit` and
# `sales`.
# NOTE(review): the literal is not assigned to a name here; assign it
# (e.g. to `periods`) before running code against it.
list(list(list(profit = 28L, sales = 1L), list(profit = 52L, 
    sales = 1L), list(profit = 7L, sales = 10L)), list(list(profit = 9L, 
    sales = 3L), list(profit = 89L, sales = 2L), list(profit = 18L, 
    sales = 5L)), list(list(profit = 91L, sales = 9L), list(profit = 74L, 
    sales = 6L), list(profit = 49L, sales = 4L)), list(list(profit = 16L, 
    sales = 5L), list(profit = 21L, sales = 7L), list(profit = 37L, 
    sales = 4L)), list(list(profit = 7L, sales = 5L), list(profit = 40L, 
    sales = 1L), list(profit = 13L, sales = 4L)), list(list(profit = 51L, 
    sales = 4L), list(profit = 42L, sales = 3L), list(profit = 82L, 
    sales = 7L)))

预期输出：

profit == 40.33
sales == 4.5

现在，我可以使用循环来实现。但我真的很想看到一个更优雅的解决方案，最好使用 tidyverse。对于简单的值列表，我可以使用类似 periods %>% map('sales') %>% unlist() %>% mean() 的写法。但是在这种更复杂的嵌套情况下，我到目前为止都没有成功。

Answer 1

您可以尝试

# Flatten periods -> markets -> individual values, then average by name.
# `periods` is the nested list built in the question (the bare name `df` would
# resolve to the base function stats::df in a fresh session).
periods %>%
  flatten() %>% # drop the period level: list of 18 markets
  flatten() %>% # drop the market level: 36 named scalars
  # Braces stop magrittr from also passing the list as tibble()'s first
  # argument, which would add a stray list-column named `.`.
  {
    tibble(a = names(.), b = unlist(.))
  } %>%
  group_by(a) %>%
  summarise(Mean = mean(b))
# A tibble: 2 x 2
  a       Mean
  <chr>  <dbl>
1 profit  40.3
2 sales    4.5

或者只是运行

# Row-bind the markets of every period into one data frame with `profit` and
# `sales` columns, then take the mean of each column.
# `periods` is the nested list built in the question.
periods %>%
  reduce(bind_rows) %>%
  summarise(across(everything(), mean))
# A tibble: 1 x 2
  profit sales
   <dbl> <dbl>
1   40.3   4.5

Answer 2

用 unlist() 将列表展开后，可以很方便地构造一个矩阵，再对其调用 summary()。

# unlist() yields a vector alternating profit, sales, ...; filling a two-column
# matrix row by row puts profit in column 1 (V1) and sales in column 2 (V2).
# Arguments are named in full instead of relying on partial matching (`by`).
summary(matrix(unlist(periods), ncol = 2, byrow = TRUE))
#       V1              V2       
# Min.   : 3.00   Min.   : 1.00  
# 1st Qu.:29.00   1st Qu.: 2.25  
# Median :46.50   Median : 4.00  
# Mean   :47.67   Mean   : 5.00  
# 3rd Qu.:67.50   3rd Qu.: 7.00  
# Max.   :89.00   Max.   :10.00  
# Same summary for `markets` alone: column 1 (V1) is profit, column 2 (V2) is
# sales. After the generation loop, `markets` holds only the markets of the
# last period.
summary(matrix(unlist(markets), ncol = 2, byrow = TRUE))
#       V1              V2   
# Min.   :29.00   Min.   :4  
# 1st Qu.:44.50   1st Qu.:5  
# Median :60.00   Median :6  
# Mean   :56.33   Mean   :6  
# 3rd Qu.:70.00   3rd Qu.:7  
# Max.   :80.00   Max.   :8  

# Pooled summary over both lists: column 1 (V1) is profit, column 2 (V2) is
# sales.
# NOTE(review): `markets` is also stored as the last element of `periods`, so
# those markets are counted twice here -- confirm that is intended.
summary(matrix(unlist(list(periods, markets)), ncol = 2, byrow = TRUE))
#       V1             V2        
# Min.   : 3.0   Min.   : 1.000  
# 1st Qu.:29.0   1st Qu.: 3.000  
# Median :48.0   Median : 4.000  
# Mean   :48.9   Mean   : 5.143  
# 3rd Qu.:70.0   3rd Qu.: 7.000  
# Max.   :89.0   Max.   :10.000

Answer 3

我们可以使用map来遍历periods中的每个列表，并分别选择"profit"和"sales"列，然后再使用mean。

library(tidyverse)

# Pool the markets of all periods into one flat list, pull each measure out by
# name, and average it. Extracting by name (not by position inside a period)
# makes every market contribute to both means; positional pluck(., 1) /
# pluck(., 2) would pick the first and second market instead of the variables.
tibble(
  profit = periods %>% flatten() %>% map_dbl("profit") %>% mean(),
  sales = periods %>% flatten() %>% map_dbl("sales") %>% mean()
)

# A tibble: 1 x 2
#  profit sales
#   <dbl> <dbl>
#1   40.3   4.5

获取嵌套在列表列表中的变量的总均值

3 个答案: