我在df中有5个变量,我想对每个变量使用以下模式来汇总它们:
min, quantile(.25), median, mean, quantile(.75), max, sd
这是我的样本df,请告知操作方法
# Sample data for the question: 10 rows x 11 columns, rebuilt with structure()
# (dput-style). The class attribute marks it as a dplyr grouped_df, grouped by
# user_id, obs_id and scroll_id (see the `vars` attribute); all rows fall into
# a single group (indices = list(0:9), group_sizes = 10L).
# NOTE(review): timestamp/start_time/end_time are written as plain numeric
# literals around 1.54e18, which is above 2^53, so they are presumably not
# stored exactly as doubles -- irrelevant for the summaries asked about here.
df <- structure(list(user_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), obs_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), scroll_id = c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), timestamp = c(1540202430007839232,
1540202430009840640, 1540202430010982656, 1540202430010982656,
1540202430011841792, 1540202430013843200, 1540202430015844608,
1540202430017846016, 1540202430019847168, 1540202430020992512
), start_time = c(1540202430007839232, 1540202430007839232, 1540202430007839232,
1540202430007839232, 1540202430007839232, 1540202430007839232,
1540202430007839232, 1540202430007839232, 1540202430007839232,
1540202430007839232), end_time = c(1540202430075907328, 1540202430075907328,
1540202430075907328, 1540202430075907328, 1540202430075907328,
1540202430075907328, 1540202430075907328, 1540202430075907328,
1540202430075907328, 1540202430075907328), gra_x = c(0.028428223,
0.028428223, 0.024191462, 0.024191462, 0.024191462, 0.024191462,
0.024191462, 0.024191462, 0.024191462, 0.008488427), gra_y = c(3.3407776,
3.3407776, 3.329257, 3.329257, 3.329257, 3.329257, 3.329257,
3.329257, 3.329257, 3.2995365), gra_z = c(9.220019, 9.220019,
9.224198, 9.224198, 9.224198, 9.224198, 9.224198, 9.224198, 9.224198,
9.234899), lin_acc_mag = c(1.87035263799625, 1.87035263799625,
1.87035263799625, 1.46659090346921, 1.46659090346921, 1.46659090346921,
1.46659090346921, 1.46659090346921, 1.46659090346921, 1.46659090346921
), vel_ang_unc_mag = c(0.363288181726866, 0.397720202371128,
0.397720202371128, 0.397720202371128, 0.462731530212917, 0.493683807731099,
0.553539962506893, 0.607457519129218, 0.629400228979264, 0.629400228979264
)), .Names = c("user_id", "obs_id", "scroll_id", "timestamp",
"start_time", "end_time", "gra_x", "gra_y", "gra_z", "lin_acc_mag",
"vel_ang_unc_mag"), row.names = c(NA, -10L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = c("user_id", "obs_id",
"scroll_id"), drop = TRUE, indices = list(0:9), group_sizes = 10L, biggest_group_size = 10L, labels = structure(list(
user_id = 1L, obs_id = 1L, scroll_id = 3L), row.names = c(NA,
-1L), class = "data.frame", vars = c("user_id", "obs_id", "scroll_id"
), drop = TRUE, .Names = c("user_id", "obs_id", "scroll_id")))
# Attempt from the question: summarise the five measurement columns with
# several statistics at once. This call FAILS with the error quoted below.
# The error reports length 5 for gra_x_quantile, i.e. quantile() returned five
# values (what its default probs produce) while every summary must be a single
# value. The `probs =` argument therefore appears not to reach quantile() when
# written this way inside funs() -- TODO confirm against the funs() docs.
df %>%
summarize_at(vars(gra_x, gra_y, gra_z, lin_acc_mag, vel_ang_unc_mag),
funs(min, max, mean, median, sd, quantile, probs = c(0.25, 0.75)))
返回错误:
Error in summarise_impl(.data, dots) :
  Column `gra_x_quantile` must be length 1 (a summary value), not 5
答案 0 :(得分:2)
如果针对每个概率分别调用 quantile,就可以正常运行,因为每个汇总函数都必须返回长度为 1 的结果。
library(tidyverse)
# Sample data for the answer, written row-wise with tibble::tribble(): the
# first line of the call gives the 11 column names (~name), and each following
# line is one of the 10 observations. No grouping is set on this tibble, so
# the summaries computed from it produce a single row.
df <- tibble::tribble(
~user_id, ~obs_id, ~scroll_id, ~timestamp, ~start_time, ~end_time, ~gra_x, ~gra_y, ~gra_z, ~lin_acc_mag, ~vel_ang_unc_mag,
1L, 1L, 3L, 1540202430007839232, 1540202430007839232, 1540202430075907328, 0.028428223, 3.3407776, 9.220019, 1.87035263799625, 0.363288181726866,
1L, 1L, 3L, 1540202430009840640, 1540202430007839232, 1540202430075907328, 0.028428223, 3.3407776, 9.220019, 1.87035263799625, 0.397720202371128,
1L, 1L, 3L, 1540202430010982656, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.87035263799625, 0.397720202371128,
1L, 1L, 3L, 1540202430010982656, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.46659090346921, 0.397720202371128,
1L, 1L, 3L, 1540202430011841792, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.46659090346921, 0.462731530212917,
1L, 1L, 3L, 1540202430013843200, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.46659090346921, 0.493683807731099,
1L, 1L, 3L, 1540202430015844608, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.46659090346921, 0.553539962506893,
1L, 1L, 3L, 1540202430017846016, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.46659090346921, 0.607457519129218,
1L, 1L, 3L, 1540202430019847168, 1540202430007839232, 1540202430075907328, 0.024191462, 3.329257, 9.224198, 1.46659090346921, 0.629400228979264,
1L, 1L, 3L, 1540202430020992512, 1540202430007839232, 1540202430075907328, 0.008488427, 3.2995365, 9.234899, 1.46659090346921, 0.629400228979264
)
# Summarise the five measurement columns with min, max, mean, median, sd and
# the 25% / 75% quantiles.
# Every entry in funs() must return a single value, so quantile() is called
# once per probability. The two calls are named explicitly (q25, q75): left
# unnamed, both receive the suffix "quantile", the names collide, and only one
# set of `<column>_quantile` columns ends up in the result (30 columns instead
# of the expected 7 x 5 = 35).
df %>%
  summarize_at(
    vars(gra_x, gra_y, gra_z, lin_acc_mag, vel_ang_unc_mag),
    funs(
      min, max, mean, median, sd,
      q25 = quantile(., probs = 0.25),
      q75 = quantile(., probs = 0.75)
    )
  )
#> # A tibble: 1 x 35
#> gra_x_min gra_y_min gra_z_min lin_acc_mag_min vel_ang_unc_mag~ gra_x_max
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.00849 3.30 9.22 1.47 0.363 0.0284
#> # ... with 29 more variables: gra_y_max <dbl>, gra_z_max <dbl>,
#> # lin_acc_mag_max <dbl>, vel_ang_unc_mag_max <dbl>, gra_x_mean <dbl>,
#> # gra_y_mean <dbl>, gra_z_mean <dbl>, lin_acc_mag_mean <dbl>,
#> # vel_ang_unc_mag_mean <dbl>, gra_x_median <dbl>, gra_y_median <dbl>,
#> # gra_z_median <dbl>, lin_acc_mag_median <dbl>,
#> # vel_ang_unc_mag_median <dbl>, gra_x_sd <dbl>, gra_y_sd <dbl>,
#> # gra_z_sd <dbl>, lin_acc_mag_sd <dbl>, vel_ang_unc_mag_sd <dbl>,
#> # gra_x_q25 <dbl>, gra_y_q25 <dbl>, gra_z_q25 <dbl>,
#> # lin_acc_mag_q25 <dbl>, vel_ang_unc_mag_q25 <dbl>, gra_x_q75 <dbl>,
#> # gra_y_q75 <dbl>, gra_z_q75 <dbl>, lin_acc_mag_q75 <dbl>,
#> # vel_ang_unc_mag_q75 <dbl>
由reprex package(v0.2.1)于2018-11-04创建
编辑:
如果您想要大量的概率,那么您可能不想手工编写所有内容。停留在tidyverse中,一种解决方法是使用tidyeval。
操作方法如下:
# Probabilities to compute quantiles for (six values here: 0, 0.2, ..., 1).
quant <- seq(0, 1, 0.2)
# Build one quantile function per probability, each with `probs` pre-filled,
# and name the list elements probs0, probs20, ... so the names can serve as
# column suffixes later.
lqfun <- set_names(
  map(quant, ~ partial(quantile, probs = .x, .lazy = FALSE)),
  paste0("probs", quant * 100)
)
head(lqfun, 2)
#> $probs0
#> function (...)
#> quantile(probs = 0, ...)
#> <environment: 0x000000001cd61730>
#>
#> $probs20
#> function (...)
#> quantile(probs = 0.2, ...)
#> <environment: 0x000000001cd70f18>
# Apply these functions inside funs() by splicing the named list with `!!!`;
# each list name (probs0, probs20, ...) becomes a column suffix in the result.
sum1 <- summarise_at(
  df,
  vars(gra_x, gra_y, gra_z, lin_acc_mag, vel_ang_unc_mag),
  funs(min, max, mean, median, sd, !!!lqfun)
)
names(sum1)
#> [1] "gra_x_min" "gra_y_min"
#> [3] "gra_z_min" "lin_acc_mag_min"
#> [5] "vel_ang_unc_mag_min" "gra_x_max"
#> [7] "gra_y_max" "gra_z_max"
#> [9] "lin_acc_mag_max" "vel_ang_unc_mag_max"
#> [11] "gra_x_mean" "gra_y_mean"
#> [13] "gra_z_mean" "lin_acc_mag_mean"
#> [15] "vel_ang_unc_mag_mean" "gra_x_median"
#> [17] "gra_y_median" "gra_z_median"
#> [19] "lin_acc_mag_median" "vel_ang_unc_mag_median"
#> [21] "gra_x_sd" "gra_y_sd"
#> [23] "gra_z_sd" "lin_acc_mag_sd"
#> [25] "vel_ang_unc_mag_sd" "gra_x_probs0"
#> [27] "gra_y_probs0" "gra_z_probs0"
#> [29] "lin_acc_mag_probs0" "vel_ang_unc_mag_probs0"
#> [31] "gra_x_probs20" "gra_y_probs20"
#> [33] "gra_z_probs20" "lin_acc_mag_probs20"
#> [35] "vel_ang_unc_mag_probs20" "gra_x_probs40"
#> [37] "gra_y_probs40" "gra_z_probs40"
#> [39] "lin_acc_mag_probs40" "vel_ang_unc_mag_probs40"
#> [41] "gra_x_probs60" "gra_y_probs60"
#> [43] "gra_z_probs60" "lin_acc_mag_probs60"
#> [45] "vel_ang_unc_mag_probs60" "gra_x_probs80"
#> [47] "gra_y_probs80" "gra_z_probs80"
#> [49] "lin_acc_mag_probs80" "vel_ang_unc_mag_probs80"
#> [51] "gra_x_probs100" "gra_y_probs100"
#> [53] "gra_z_probs100" "lin_acc_mag_probs100"
#> [55] "vel_ang_unc_mag_probs100"
由reprex package(v0.2.1)于2018-11-04创建
这样就得到了您想要的所有结果。另一种解决方案是使用列表列:创建一个分位数列,其中每个元素是一个列表,保存 quantile(., probs = quant) 的结果。然后,您可以对该列表列进行 unnest(取消嵌套)或其他操作。
答案 1 :(得分:1)
您可以尝试以下 2 个包...
# Ready-made descriptive summaries of the five measurement columns.
# Columns are selected by name with a character vector: bare names such as
# c(gra_x, gra_y) are evaluated as ordinary variables and fail with
# "object 'gra_x' not found".
stat_cols <- c("gra_x", "gra_y", "gra_z", "lin_acc_mag", "vel_ang_unc_mag")
library(Hmisc)
# Hmisc and psych both export describe(); qualify each call with its package
# so that loading the second package does not silently change which one runs.
Hmisc::describe(df[, stat_cols])
library(psych)
# describe.by() is deprecated in psych and was called without a grouping
# variable, so plain describe() is used instead; `quant` adds the requested
# 25% / 75% quantiles to the min/max/mean/median/sd table.
psych::describe(df[, stat_cols], quant = c(0.25, 0.75))
# Base R: min, 1st quartile, median, mean, 3rd quartile and max per column.
summary(df[, stat_cols])