excel中的数据透视表的一个好处是它们可以自动提供小计。首先,我想知道在dplyr中是否已经创建了可以实现此目的的任何内容。如果没有,实现它的最简单方法是什么?
在下面的示例中,我显示了气缸和化油器数量的平均排量。对于每组气缸(4,6,8),我希望看到该组的平均位移(或总位移,或任何其他汇总统计)。
library(dplyr)
mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl carb mean(disp)
1 4 1 91.38
2 4 2 116.60
3 6 1 241.50
4 6 4 163.80
5 6 6 145.00
6 8 2 345.50
7 8 3 275.80
8 8 4 405.50
9 8 8 301.00
答案 0 :(得分:8)
data.table 这是非常笨重的,但这是一种方式:
library(data.table)
DT <- data.table(mtcars)
rbind(
DT[,.(mean(disp)), by=.(cyl,carb)],
DT[,.(mean(disp), carb=NA), by=.(cyl) ],
DT[,.(mean(disp), cyl=NA), by=.(carb)]
)[order(cyl,carb)]
这给出了
cyl carb V1
1: 4 1 91.3800
2: 4 2 116.6000
3: 4 NA 105.1364
4: 6 1 241.5000
5: 6 4 163.8000
6: 6 6 145.0000
7: 6 NA 183.3143
8: 8 2 345.5000
9: 8 3 275.8000
10: 8 4 405.5000
11: 8 8 301.0000
12: 8 NA 353.1000
13: NA 1 134.2714
14: NA 2 208.1600
15: NA 3 275.8000
16: NA 4 308.8200
17: NA 6 145.0000
18: NA 8 301.0000
我宁愿看到类似R table
的结果,但不知道有任何函数。
dplyr @akrun发现了这个类似的代码
bind_rows(
mtcars %>%
group_by(cyl, carb) %>%
summarise(Mean= mean(disp)),
mtcars %>%
group_by(cyl) %>%
summarise(carb=NA, Mean=mean(disp)),
mtcars %>%
group_by(carb) %>%
summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)
我们可以在函数
中包装重复操作library(lazyeval)
f1 <- function(df, grp, Var, func){
FUN <- match.fun(func)
df %>%
group_by_(.dots=grp) %>%
summarise_(interp(~FUN(v), v=as.name(Var)))
}
m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
m2 <- f1(mtcars, 'carb', 'disp', 'mean')
m3 <- f1(mtcars, 'cyl', 'disp', 'mean')
bind_rows(list(m1, m2, m3)) %>%
arrange(cyl, carb) %>%
rename(Mean=`FUN(disp)`)
carb cyl Mean
1 1 4 91.3800
2 2 4 116.6000
3 NA 4 105.1364
4 1 6 241.5000
5 4 6 163.8000
6 6 6 145.0000
7 NA 6 183.3143
8 2 8 345.5000
9 3 8 275.8000
10 4 8 405.5000
11 8 8 301.0000
12 NA 8 353.1000
13 1 NA 134.2714
14 2 NA 208.1600
15 3 NA 275.8000
16 4 NA 308.8200
17 6 NA 145.0000
18 8 NA 301.0000
对于带有rbindlist
的data.table fill
,可以使任何一个选项都不那么难看:
rbindlist(list(
mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
mtcars %>% group_by(carb) %>% summarise(mean(disp)),
mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)
rbindlist(list(
DT[,mean(disp),by=.(cyl,carb)],
DT[,mean(disp),by=.(cyl)],
DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]
答案 1 :(得分:5)
类似table
与addmargins
的内容(虽然实际上是data.frame
)
library(dplyr)
library(reshape2)
out <- bind_cols(
mtcars %>% group_by(cyl, carb) %>%
summarise(mu = mean(disp)) %>%
dcast(cyl ~ carb),
(mtcars %>% group_by(cyl) %>% summarise(Total=mean(disp)))[,2]
)
margin <- t((mtcars %>% group_by(carb) %>% summarise(Total=mean(disp)))[,2])
rbind(out, c(NA, margin, mean(mtcars$disp))) %>%
`rownames<-`(c(paste("cyl", c(4,6,8)), "Total")) # add some row names
# cyl 1 2 3 4 6 8 Total
# cyl 4 4 91.3800 116.60 NA NA NA NA 105.1364
# cyl 6 6 241.5000 NA NA 163.80 145 NA 183.3143
# cyl 8 8 NA 345.50 275.8 405.50 NA 301 353.1000
# Total NA 134.2714 208.16 275.8 308.82 145 301 230.7219
底行是列方式边距,列名为1:8的列是碳水化合物,而Total是行方向边距。
答案 2 :(得分:4)
也可以简单地加入两组结果:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean(disp))
joined <- full_join(cyl_carb, cyl)
result <- arrange(joined, cyl)
result
给出:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb mean(disp)
(dbl) (dbl) (dbl)
1 4 1 91.3800
2 4 2 116.6000
3 4 NA 105.1364
4 6 1 241.5000
5 6 4 163.8000
6 6 6 145.0000
7 6 NA 183.3143
8 8 2 345.5000
9 8 3 275.8000
10 8 4 405.5000
11 8 8 301.0000
12 8 NA 353.1000
或附加一栏:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean.cyl = mean(disp))
joined <- full_join(cyl_carb, cyl)
joined
给出:
Source: local data frame [9 x 4]
Groups: cyl [?]
cyl carb mean(disp) mean.cyl
(dbl) (dbl) (dbl) (dbl)
1 4 1 91.38 105.1364
2 4 2 116.60 105.1364
3 6 1 241.50 183.3143
4 6 4 163.80 183.3143
5 6 6 145.00 183.3143
6 8 2 345.50 353.1000
7 8 3 275.80 353.1000
8 8 4 405.50 353.1000
9 8 8 301.00 353.1000
答案 3 :(得分:3)
data.table
以上的v1.11
版本
library(data.table)
cubed <- cube(
as.data.table(mtcars),
.(`mean(disp)` = mean(disp)),
by = c("cyl", "carb")
)
#> cyl carb mean(disp)
#> 1: 6 4 163.8000
#> 2: 4 1 91.3800
#> 3: 6 1 241.5000
#> 4: 8 2 345.5000
#> 5: 8 4 405.5000
#> 6: 4 2 116.6000
#> 7: 8 3 275.8000
#> 8: 6 6 145.0000
#> 9: 8 8 301.0000
#> 10: 6 NA 183.3143
#> 11: 4 NA 105.1364
#> 12: 8 NA 353.1000
#> 13: NA 4 308.8200
#> 14: NA 1 134.2714
#> 15: NA 2 208.1600
#> 16: NA 3 275.8000
#> 17: NA 6 145.0000
#> 18: NA 8 301.0000
#> 19: NA NA 230.7219
res <- dcast(
cubed,
cyl ~ carb,
value.var = "mean(disp)"
)
#> cyl NA 1 2 3 4 6 8
#> 1: NA 230.7219 134.2714 208.16 275.8 308.82 145 301
#> 2: 4 105.1364 91.3800 116.60 NA NA NA NA
#> 3: 6 183.3143 241.5000 NA NA 163.80 145 NA
#> 4: 8 353.1000 NA 345.50 275.8 405.50 NA 301
由reprex package(v0.3.0)于2020-02-20创建
来源:https://jozef.io/r912-datatable-grouping-sets/
library(kableExtra)
options(knitr.kable.NA = "")
res <- as.data.frame(res)
names(res)[2] <- "overall"
res[1, 1] <- "overall"
x <- kable(res, "html")
x <- kable_styling(x, "striped")
add_header_above(x, c(" " = 1, "carb" = ncol(res) - 1))
答案 4 :(得分:2)
我知道这可能不是一个非常优雅的解决方案,但我希望它有所帮助:
p <-mtcars %>% group_by(cyl,carb)
p$cyl <- as.factor(p$cyl)
average_disp <- sapply(1:length(levels(p$cyl)), function(x)mean(subset(p,p$cyl==levels(p$cyl)[x])$disp))
df <- data.frame(levels(p$cyl),average_disp)
colnames(df)[1]<-"cyl"
#> df
# cyl average_disp
#1 4 105.1364
#2 6 183.3143
#3 8 353.1000
(编辑:在对p
的定义进行微小修改后,现在产生与@ Frank&s和@ akrun解决方案相同的结果)
答案 5 :(得分:2)
这是一个简单的单线程在data_frame中创建边距:
library(plyr)
library(dplyr)
# Margins without labels
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), Mean_Disp=sum(.$Mean_Disp, na.rm=T))))
输出:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <dbl> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 NA 207.98
4 6 1 241.50
5 6 4 163.80
6 6 6 145.00
7 6 NA 550.30
8 8 2 345.50
9 8 3 275.80
10 8 4 405.50
11 8 8 301.00
12 8 NA 1327.80
您还可以为摘要统计信息添加标签,如:
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), carb=c("Total", "Mean"), Mean_Disp=c(sum(.$Mean_Disp, na.rm=T), mean(.$Mean_Disp, na.rm=T)))))
输出:
Source: local data frame [15 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <chr> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 Total 207.98
4 4 Mean 103.99
5 6 1 241.50
6 6 4 163.80
7 6 6 145.00
8 6 Total 550.30
9 6 Mean 183.43
10 8 2 345.50
11 8 3 275.80
12 8 4 405.50
13 8 8 301.00
14 8 Total 1327.80
15 8 Mean 331.95
答案 6 :(得分:0)
您可以在master
周围使用此包装器,对每个可能的边距应用ddply
,并使用通常的输出ddply
结果。
将所有分组因素边缘化:
rbinds
仅限于mtcars %>% ddplym(.variables = .(cyl, carb), .fun = summarise, mean(disp))
边缘:
carb
打包机:
mtcars %>% ddplym(
.variables = .(carb),
.fun = function(data) data %>% group_by(cyl) %>% summarise(mean(disp)))
答案 7 :(得分:0)
为此分享我的方法(如果有帮助的话)。这种方法可以很容易地添加自定义的小计和总计。
data = data.frame( thing1=sprintf("group %i",trunc(runif(200,0,5))),
thing2=sprintf("type %i",trunc(runif(200,0,5))),
value=rnorm(200,0,1) )
data %>%
group_by( thing1, thing2 ) %>%
summarise( sum=sum(value),
count=n() ) %>%
ungroup() %>%
bind_rows(.,
identity(.) %>%
group_by(thing1) %>%
summarise( aggregation="sub total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup(),
identity(.) %>%
summarise( aggregation="total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup() ) %>%
arrange( thing1, thing2, aggregation ) %>%
select( aggregation, everything() )
答案 8 :(得分:0)
我已经为非常相似的问题进行了长时间的努力,但我发现data.table
提供了最适合此目的的最简单,最快的解决方案
data.table::cube(
data.table::as.data.table(mtcars),
.(mean_disp = mean(disp)),
by = c("cyl","carb"))
cyl carb mean_disp
1: 6 4 163.8000
2: 4 1 91.3800
3: 6 1 241.5000
4: 8 2 345.5000
5: 8 4 405.5000
6: 4 2 116.6000
7: 8 3 275.8000
8: 6 6 145.0000
9: 8 8 301.0000
10: 6 NA 183.3143
11: 4 NA 105.1364
12: 8 NA 353.1000
13: NA 4 308.8200
14: NA 1 134.2714
15: NA 2 208.1600
16: NA 3 275.8000
17: NA 6 145.0000
18: NA 8 301.0000
19: NA NA 230.7219
NA
条目是您要寻找的小计;例如,在第10行中,183.31
结果是所有6个圆柱的平均值。具有双NA
的最后一行是整体均值的那一行。
从那里,您可以轻松地用as_tibble()
包装结果,以跳回到dplyr
语义世界。
答案 9 :(得分:0)
遇到了同样的问题,我正在研究一个函数,希望可以解决这个问题(请参阅https://github.com/jrf1111/TCCD/blob/dev/R/with_subtotals.R)。它仍处于开发阶段,但确实可以满足您的需求。
mtcars %>%
group_by(cyl, carb) %>%
with_subtotals() %>%
summarize(mean(disp))
# A tibble: 19 x 3
# Groups: cyl [5]
cyl carb `mean(disp)`
<chr> <chr> <dbl>
1 4 1 91.4
2 4 2 117.
3 4 subtotal 105.
4 6 1 242.
5 6 4 164.
6 6 6 145
7 6 subtotal 183.
8 8 2 346.
9 8 3 276.
10 8 4 406.
11 8 8 301
12 8 subtotal 353.
13 subtotal 1 134.
14 subtotal 2 208.
15 subtotal 3 276.
16 subtotal 4 309.
17 subtotal 6 145
18 subtotal 8 301
19 total total 231.