我有一个 tibble 格式的非常大的数据集。我想使用一些返回列表的函数来汇总数据。我对列表中的几个组件感兴趣,并希望将所需的每个组件分别返回到新的 tibble 列中。
这是一个例子
library(tibble)
library(dplyr)
# Simulated example data: 100 subgroups of 10 draws each (1,000 values in
# total), drawn from a normal distribution with mean 5 and sd 1.
contrived_data <- tibble(
  subgroup = rep(seq_len(100), each = 10),
  value = rnorm(1000, mean = 5, sd = 1)
)
# Per-subgroup summary: mean, standard deviation, and a one-sample KS test of
# the values against N(mean = 5, sd = 1). The statistic and the p-value each
# go into their own column, which here takes two separate ks.test() calls
# per subgroup.
contrived_data %>%
  group_by(subgroup) %>%
  summarise(
    avg = mean(value),
    std_dev = sd(value),
    ks_stat = ks.test(value, "pnorm", mean = 5, sd = 1)$statistic,
    ks_pval = ks.test(value, "pnorm", mean = 5, sd = 1)$p.value
  )
以这种方式运行可以得到我想要的结果,但效率不高。调用两次 ks.test 函数意味着执行时间几乎翻倍。
应该有一种更高效的方法,只需一次函数调用就能提取这两个列表组件,但我不知道该怎么做。
答案 0 :(得分:2)
使用 dplyr 的 rowwise
命令的
解决方案,其执行的任务与 map
相同。
# Step 1: one row per subgroup, with the complete ks.test() result stored in
# a list-column so the test runs only once per subgroup.
# Step 2: go row by row and pull the statistic and p-value out of the stored
# test object, then drop the list-column.
contrived_data %>%
  group_by(subgroup) %>%
  summarise(
    avg = mean(value),
    std_dev = sd(value),
    ks_test = list(ks.test(value, "pnorm", mean = 5, sd = 1))
  ) %>%
  ungroup() %>%
  rowwise() %>%
  mutate(
    ks_stat = ks_test$statistic,
    ks_pval = ks_test$p.value
  ) %>%
  # leave row-wise mode before selecting the final columns
  ungroup() %>%
  select(-ks_test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 5.10 1.24 0.186 0.819
# 2 2 4.86 0.805 0.231 0.584
# 3 3 5.24 0.729 0.258 0.445
# 4 4 5.16 0.642 0.307 0.247
# 5 5 4.63 0.752 0.393 0.0664
# Benchmark using rbenchmark:
# test replications elapsed relative user.self sys.self user.child sys.child
#2 nested 1000 10.58 1.000 10.58 0 NA NA
#1 original 1000 16.75 1.583 16.73 0 NA NA
答案 1 :(得分:2)
您可以定义一个函数,并使用 purrr 中的 map 系列函数(这里是 map_dfr):
library(tibble)
library(dplyr)
library(purrr)
# Summarise the rows of one subgroup using a single KS test.
#
# DA: a data frame with a `subgroup` column and a numeric `value` column
#     (e.g. one piece of `split(data, data$subgroup)`).
#
# Returns a data.frame with columns subgroup, avg, std_dev, ks_stat and
# ks_pval: one row per distinct `subgroup` value in DA, i.e. a single row
# when DA holds exactly one subgroup.
func <- function(DA) {
  # Run the test once and reuse the result for both output columns.
  kstest <- ks.test(DA$value, "pnorm", mean = 5, sd = 1)
  data.frame(
    subgroup = unique(DA$subgroup),
    avg = mean(DA$value),
    std_dev = sd(DA$value),
    # The statistic is a named vector ("D"); without unname() data.frame()
    # would use that name as the row name of the result.
    ks_stat = unname(kstest$statistic),
    ks_pval = kstest$p.value
  )
}
# Split the data into one data frame per subgroup, apply func() to each
# piece, and row-bind the per-subgroup results into a single data frame.
split(contrived_data, contrived_data$subgroup) %>%
  map_dfr(func)
答案 2 :(得分:1)
测试可以只运行一次并包装在 list 中,然后使用 map(来自 purrr)来提取值:
library(purrr)
library(dplyr)
library(tidyr)
# Run the KS test once per subgroup and keep the whole result object in a
# list-column; then build a small tibble with the two values of interest for
# each stored test and spread it into regular columns.
contrived_data %>%
  group_by(subgroup) %>%
  summarise(
    avg = mean(value),
    std_dev = sd(value),
    test = list(ks.test(value, "pnorm", mean = 5, sd = 1))
  ) %>%
  mutate(
    out = map(test, function(res) {
      tibble(ks_stat = res$statistic, ks_pval = res$p.value)
    })
  ) %>%
  unnest_wider(out) %>%
  # the raw test objects are no longer needed
  select(-test)
# A tibble: 100 x 5
# subgroup avg std_dev ks_stat ks_pval
# <int> <dbl> <dbl> <dbl> <dbl>
# 1 1 4.52 0.675 0.375 0.0907
# 2 2 5.17 1.02 0.342 0.152
# 3 3 5.02 0.909 0.141 0.972
# 4 4 5.08 0.846 0.313 0.227
# 5 5 4.82 0.819 0.225 0.614
# 6 6 5.07 0.866 0.159 0.928
# 7 7 4.94 0.914 0.145 0.966
# 8 8 5.52 1.01 0.290 0.306
# 9 9 5.17 0.787 0.258 0.443
#10 10 4.61 1.15 0.476 0.0132
# … with 90 more rows
另一种选择是使用 tidy(来自 broom),一次提取所有分量。
答案 3 :(得分:1)
您可以使用group_modify
library(tidyverse)
# Apply one function per subgroup: run the KS test a single time on the
# subgroup's rows (`.x`) and return a one-row tibble holding every summary.
contrived_data %>%
  group_by(subgroup) %>%
  group_modify(~ {
    ks <- ks.test(.x$value, "pnorm", mean = 5, sd = 1)
    tibble(
      avg = mean(.x$value),
      std_dev = sd(.x$value),
      ks_stat = ks$statistic,
      ks_pval = ks$p.value
    )
  }) %>%
  # group_modify() returns a grouped tibble; drop the grouping so later verbs
  # do not silently operate per subgroup.
  ungroup()
或者使用data.table
library(data.table)
# Turn contrived_data into a data.table, then compute all four summaries per
# subgroup from a single ks.test() call inside the `j` expression.
setDT(contrived_data)
contrived_data[, {
  ks_res <- ks.test(value, "pnorm", mean = 5, sd = 1)
  list(
    avg = mean(value),
    std_dev = sd(value),
    ks_stat = ks_res$statistic,
    ks_pval = ks_res$p.value
  )
}, by = subgroup]