我想实现与 this post 中相同的效果,即在多个直方图上叠加密度曲线。
该帖子中的解决方案是有效的,但我想知道 `dfn` 是否可以用 `purrr` / `purrrlyr` 这样的新包来计算:
# Reproducible example: two groups of 100 normal draws each
# ("test" with mean 1, "control" with mean 4).
library(dplyr)

set.seed(1)
df <- data.frame(
  bsa = rnorm(200, mean = rep(c(1, 4), each = 100)),
  group = rep(c("test", "control"), each = 100)
)

# Per-group mean and standard deviation of bsa.
stats <- df %>%
  group_by(group) %>%
  summarise(m = mean(bsa), sd = sd(bsa))

# Common evaluation grid of 100 points spanning the observed range of bsa.
x <- with(df, seq(min(bsa), max(bsa), length.out = 100))

# One normal density curve per row of stats, stacked into a single data frame
# with columns group, x, y. seq_len() keeps the loop empty if stats has no rows.
dfn <- do.call(rbind, lapply(
  seq_len(nrow(stats)),
  function(i) {
    with(stats[i, ], data.frame(group, x, y = dnorm(x, mean = m, sd = sd)))
  }
))
为了实现内层的 `lapply` 部分,我一直在尝试:
# Attempt at the inner step; reported by the author as not working.
# NOTE(review): the piped, grouped `stats` fills map()'s first argument, so
# `x` ends up in the function position instead of `dnorm` - confirm against
# the purrr::map() signature.
stats %>%
dplyr::group_by(group) %>%
purrr::map(x, dnorm, m, sd)
也就是说,从 `stats` 中传入 `m` 和 `sd`,并使用同一个 `x`。不幸的是,这行不通。
(一旦内层部分完成,我就可以把结果传给 `do.call`,那样就不会有问题了。)
答案 0 :(得分:2)
如果你使用 `dplyr`,我认为你其实不需要单独计算 `stats` 和 `x`。我是这样做的:
# Build the per-group density grid in one pipeline, without precomputing
# `stats` or `x`.
dfn_2 <-
  df %>%
  # Overall (ungrouped) range of bsa, repeated on every row. Plain mutate()
  # replaces the deprecated mutate_at(vars(bsa), funs(min, max)) and avoids
  # creating columns that shadow min() and max().
  mutate(bsa_min = min(bsa), bsa_max = max(bsa)) %>%
  arrange(group) %>%
  group_by(group) %>%
  transmute(
    # length.out = n() makes the grid exactly as long as the group (100 rows
    # here), so transmute() receives one value per row.
    x = seq(first(bsa_min), first(bsa_max), length.out = n()),
    y = dnorm(x, mean(bsa), sd(bsa))
  ) %>%
  as.data.frame()
all.equal(dfn, dfn_2)
# [1] TRUE
另外,还有两种我并不推荐的方法,仅用于说明这是可行的,以及如何实现你正在尝试的做法:
# Split the per-group statistics into a named list (one table per group),
# then bind one density table per element, tagged with the element's name.
dfn_3 <-
  stats %>%
  split(.$group) %>%
  map2_df(
    names(.),
    function(group_stats, group_name) {
      tibble(
        group = group_name,
        x,
        y = dnorm(x, group_stats$m, group_stats$sd)
      )
    }
  )
# # A tibble: 200 x 3
# group x y
# <chr> <dbl> <dbl>
# 1 control -1.888921 6.490182e-09
# 2 control -1.809524 1.045097e-08
# 3 control -1.730128 1.672139e-08
# 4 control -1.650731 2.658301e-08
# 5 control -1.571334 4.199062e-08
# 6 control -1.491938 6.590471e-08
# 7 control -1.412541 1.027772e-07
# 8 control -1.333145 1.592550e-07
# 9 control -1.253748 2.451917e-07
# 10 control -1.174352 3.750891e-07
# # ... with 190 more rows
# Convert group back to a factor before comparing with dfn.
all.equal(dfn, as.data.frame(mutate(dfn_3, group = as.factor(group))))
# [1] TRUE
# Same result via list-columns: store the shared grid `x` once per group,
# map dnorm() over it with that group's m and sd, then expand both
# list-columns back into rows.
dfn_4 <-
  stats %>%
  group_by(group) %>%
  transmute(x = list(x), y = map(x, dnorm, m, sd)) %>%
  ungroup() %>%
  # Name the list-columns explicitly; calling unnest() without `cols` is
  # deprecated since tidyr 1.0.0 and emits a warning.
  tidyr::unnest(cols = c(x, y))
# # A tibble: 200 x 3
# group x y
# <fctr> <dbl> <dbl>
# 1 control -1.888921 6.490182e-09
# 2 control -1.809524 1.045097e-08
# 3 control -1.730128 1.672139e-08
# 4 control -1.650731 2.658301e-08
# 5 control -1.571334 4.199062e-08
# 6 control -1.491938 6.590471e-08
# 7 control -1.412541 1.027772e-07
# 8 control -1.333145 1.592550e-07
# 9 control -1.253748 2.451917e-07
# 10 control -1.174352 3.750891e-07
# # ... with 190 more rows
all.equal(dfn, as.data.frame(dfn_4))
# [1] TRUE
答案 1 :(得分:1)
应 @Aurele 的要求,这是我的做法:
library(dplyr)
library(tidyr)
library(ggplot2)
# Two groups of 100 normal draws each ("test" with mean 1, "control" with
# mean 4).
df <- data.frame(
  bsa = rnorm(200, mean = rep(c(1, 4), each = 100)),
  group = rep(c("test", "control"), each = 100)
)

# Approach 1: store an x grid per group in a list-column, then unnest it.
df %>%
  group_by(group) %>%
  # Explicit summaries replace the deprecated summarise_all(funs(...)) and
  # yield the same column names: mean, sd, min, max.
  summarise(mean = mean(bsa), sd = sd(bsa), min = min(bsa), max = max(bsa)) %>%
  group_by(group) %>%
  # tibble() replaces the deprecated data_frame().
  mutate(newdata = list(tibble(x = seq(min, max, length.out = 80)))) %>%
  # unnest() without `cols` is deprecated since tidyr 1.0.0.
  unnest(cols = newdata) %>%
  mutate(dens = dnorm(x, mean, sd)) %>%
  ggplot() +
  # after_stat(density) replaces the deprecated ..density.. notation.
  geom_histogram(
    data = df,
    aes(bsa, y = after_stat(density), fill = group),
    alpha = 0.5
  ) +
  # Line thickness is set with linewidth; size is deprecated for lines.
  geom_line(aes(x, dens, color = group), linewidth = 2)
# Approach 2: make the statistics grouping keys, then let do() build a fresh
# x grid for each group.
df %>%
  group_by(group) %>%
  # Explicit summaries replace the deprecated summarise_all(funs(...)) and
  # yield the same column names: mean, sd, min, max.
  summarise(mean = mean(bsa), sd = sd(bsa), min = min(bsa), max = max(bsa)) %>%
  group_by(group, mean, sd, min, max) %>%
  # tibble() replaces the deprecated data_frame().
  do(tibble(x = seq(.$min, .$max, length.out = 80))) %>%
  mutate(dens = dnorm(x, mean, sd)) %>%
  ggplot() +
  # after_stat(density) replaces the deprecated ..density.. notation.
  geom_histogram(
    data = df,
    aes(bsa, y = after_stat(density), fill = group),
    alpha = 0.5
  ) +
  # Line thickness is set with linewidth; size is deprecated for lines.
  geom_line(aes(x, dens, color = group), linewidth = 2)
我的两种方法本质上相同,只是生成新数据的方式略有不同:
1. 新建一个 `list` 类型的 `newdata` 列,其中存放从 `bsa` 的 min 到 max 的 x 值序列,然后用 `tidyr::unnest` 展开;
2. 用 `do` 生成全新的数据框,每个数据框都会继承(作为分组变量的)统计信息列。