Question

我有一个 tibble 格式的非常大的数据集。我想使用一些返回列表的函数来汇总数据。我只对列表中的几个组件感兴趣，并希望把需要的每个组件分别放到新的 tibble 列中。

这是一个例子

library(tibble)
library(dplyr)

# Simulated data: 1,000 draws from N(mean = 5, sd = 1), labelled as
# 100 subgroups of 10 observations each.
contrived_data <- tibble(
  subgroup = rep(1:100, each = 10),
  value = rnorm(1000, mean = 5, sd = 1)
)

# Per-subgroup summary: mean, standard deviation, and the one-sample KS test
# against N(5, 1). The KS statistic and the p-value each get their own column.
# Note that ks.test() is evaluated twice per subgroup here, once for every
# component that is extracted.
contrived_data %>%
  group_by(subgroup) %>%
  summarize(
    avg = mean(value),
    std_dev = sd(value),
    ks_stat = ks.test(value, "pnorm", mean = 5, sd = 1)$statistic,
    ks_pval = ks.test(value, "pnorm", mean = 5, sd = 1)$p.value
  )

以这种方式运行它可以获得我想要的结果，但是效率却不高。两次调用ks.test函数意味着执行时间几乎翻倍。似乎必须有一种更有效的方法来通过一个函数调用提取这两个列表组件，但是我不知道该怎么做。

Answer 1

使用dplyr命令的rowwise解决方案，其执行的任务与map相同。

# Run ks.test() once per subgroup and keep the whole result in a list-column,
# then pull the two components of interest out of it row by row.
contrived_data %>%
  group_by(subgroup) %>%
  summarise(
    avg = mean(value),
    std_dev = sd(value),
    ks_test = list(ks.test(value, "pnorm", mean = 5, sd = 1))
  ) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(
    ks_stat = ks_test[["statistic"]],
    ks_pval = ks_test[["p.value"]]
  ) %>%
  ungroup() %>% # leave rowwise mode again
  select(-ks_test) # the list-column is no longer needed

# A tibble: 100 x 5
#   subgroup   avg std_dev ks_stat ks_pval
#      <int> <dbl>   <dbl>   <dbl>   <dbl>
# 1        1  5.10   1.24    0.186  0.819 
# 2        2  4.86   0.805   0.231  0.584 
# 3        3  5.24   0.729   0.258  0.445 
# 4        4  5.16   0.642   0.307  0.247 
# 5        5  4.63   0.752   0.393  0.0664

# Benchmark using rbenchmark:
#      test replications elapsed relative user.self sys.self user.child sys.child
#2   nested         1000   10.58    1.000     10.58        0         NA        NA
#1 original         1000   16.75    1.583     16.73        0         NA        NA

Answer 2

您可以定义一个函数，并使用 purrr 包中的 map 系列函数：

library(tibble)
library(dplyr)
library(purrr)

# Summarise the data of one subgroup and run a single KS test on it.
#
# DA: data frame with the columns `subgroup` and `value`; intended to hold
#     the rows of exactly one subgroup.
# Returns a data.frame with the columns subgroup, avg, std_dev, ks_stat and
# ks_pval (one row when DA contains a single subgroup value).
func <- function(DA) {
  # Run the test once; both components are read from the same result.
  kstest <- ks.test(DA$value, "pnorm", mean = 5, sd = 1)
  data.frame(
    subgroup = unique(DA$subgroup),
    avg = mean(DA$value),
    std_dev = sd(DA$value),
    # ks.test() labels the statistic "D"; drop the name so that it does not
    # end up as the row name of the returned data frame.
    ks_stat = unname(kstest$statistic),
    ks_pval = kstest$p.value
  )
}

# Split the data into one data frame per subgroup, apply func() to each
# piece, and row-bind the results.
split(contrived_data, contrived_data$subgroup) %>%
  map_dfr(func)

Answer 3

测试只需运行一次并包装在 list 中，然后使用 map（来自 purrr）来提取值：

library(purrr)
library(dplyr)
library(tidyr)

contrived_data %>%
  group_by(subgroup) %>%
  summarize(avg = mean(value),
            std_dev = sd(value),
            test = list(ks.test(value, "pnorm", mean = 5, sd = 1))) %>%
  mutate(out = map(test, ~ tibble(ks_stat = .x$statistic, ks_pval = .x$p.value))) %>%
  unnest_wider(c(out)) %>%
  select(-test)

# A tibble: 100 x 5
#   subgroup   avg std_dev ks_stat ks_pval
#      <int> <dbl>   <dbl>   <dbl>   <dbl>
# 1        1  4.52   0.675   0.375  0.0907
# 2        2  5.17   1.02    0.342  0.152
# 3        3  5.02   0.909   0.141  0.972
# 4        4  5.08   0.846   0.313  0.227
# 5        5  4.82   0.819   0.225  0.614
# 6        6  5.07   0.866   0.159  0.928
# 7        7  4.94   0.914   0.145  0.966
# 8        8  5.52   1.01    0.290  0.306
# 9        9  5.17   0.787   0.258  0.443
#10       10  4.61   1.15    0.476  0.0132
# … with 90 more rows

另一种选择是使用 tidy（来自 broom）一次提取所有分量。

Answer 4

您可以使用group_modify

library(tidyverse)

# For each subgroup, run ks.test() once and build a one-row tibble holding
# the summary statistics together with both test components. Inside the
# lambda, `.x` is the data of the current group.
contrived_data %>%
  group_by(subgroup) %>%
  group_modify(~ {
    ks_res <- ks.test(.x$value, "pnorm", mean = 5, sd = 1)
    tibble(
      avg = mean(.x$value),
      std_dev = sd(.x$value),
      ks_stat = ks_res$statistic,
      ks_pval = ks_res$p.value
    )
  })

或者使用data.table

library(data.table)
# Turn contrived_data into a data.table, then compute every summary per
# subgroup inside one `j` expression so that ks.test() runs only once per group.
setDT(contrived_data)

contrived_data[, {
  ks_res <- ks.test(value, "pnorm", mean = 5, sd = 1)
  list(
    avg = mean(value),
    std_dev = sd(value),
    ks_stat = ks_res$statistic,
    ks_pval = ks_res$p.value
  )
}, by = subgroup]

如何在 R 中将列表元素提取到多个 tibble 列中？

4 个答案: