我有一个数据集(前100行):
structure(list(department = structure(c(21L, 14L, 4L, 11L, 21L,
12L, 15L, 11L, 3L, 18L, 4L, 20L, 25L, 3L, 3L, 13L, 19L, 22L,
18L, 16L, 16L, 16L, 16L, 4L, 20L, 12L, 4L, 27L, 1L, 6L, 16L,
1L, 13L, 13L, 25L, 18L, 8L, 23L, 10L, 16L, 4L, 21L, 2L, 5L, 18L,
10L, 23L, 4L, 7L, 5L, 14L, 15L, 19L, 23L, 11L, 4L, 15L, 6L, 12L,
11L, 23L, 14L, 15L, 11L, 18L, 24L, 27L, 27L, 20L, 5L, 1L, 19L,
4L, 10L, 4L, 26L, 3L, 14L, 15L, 12L, 22L, 14L, 20L, 25L, 2L,
23L, 15L, 13L, 4L, 18L, 26L, 13L, 5L, 10L, 1L, 6L, 10L, 22L,
5L, 14L), .Label = c("Beauty", "Boutique advisor", "Boutique advisors",
"Boutique Stylist", "Clean Beauty Expert", "Conseiller en boutique",
"Design Consultant", "Designer Trade Specialist", "Food", "Furniture",
"In-store Design Expert", "In-store experts", "In-Store Sales Professional",
"In-Store Style Experts", "John Hardy", "Jos. A. Bank LIVE!",
"Levi's Stylists", "Lighting & Home Accessories", "Men's Wearhouse LIVE!",
"Menswear", "Personal advisors", "Styliste en boutique", "Vendeurs",
"Wine", "Women's Accessories", "Women's shoes", "Womenswear"), class = "factor"),
type = c("Completed", "Missed", "Missed", "Missed", "Missed",
"Missed", "Missed", "Completed", "Completed", "Missed", "Missed",
"Completed", "Completed", "Completed", "Completed", "Completed",
"Completed", "Completed", "Completed", "Missed", "Completed",
"Missed", "Completed", "Missed", "Missed", "Completed", "Missed",
"Missed", "Missed", "Completed", "Missed", "Completed", "Missed",
"Completed", "Missed", "Missed", "Completed", "Missed", "Missed",
"Completed", "Completed", "Missed", "Completed", "Missed",
"Completed", "Missed", "Missed", "Completed", "Missed", "Completed",
"Completed", "Missed", "Completed", "Missed", "Completed",
"Completed", "Missed", "Missed", "Missed", "Missed", "Completed",
"Missed", "Completed", "Completed", "Completed", "Missed",
"Missed", "Completed", "Missed", "Completed", "Completed",
"Missed", "Completed", "Completed", "Missed", "Missed", "Completed",
"Completed", "Completed", "Completed", "Missed", "Completed",
"Completed", "Completed", "Completed", "Completed", "Completed",
"Completed", "Completed", "Completed", "Completed", "Missed",
"Missed", "Completed", "Completed", "Completed", "Missed",
"Completed", "Missed", "Completed"), date = structure(c(17889,
17890, 17893, 17893, 17892, 17892, 17893, 17893, 17892, 17888,
17892, 17889, 17888, 17893, 17888, 17889, 17891, 17892, 17893,
17891, 17889, 17888, 17892, 17889, 17889, 17892, 17888, 17889,
17893, 17892, 17893, 17892, 17891, 17893, 17888, 17891, 17892,
17891, 17892, 17888, 17891, 17893, 17893, 17892, 17890, 17888,
17888, 17889, 17891, 17893, 17893, 17890, 17890, 17892, 17889,
17892, 17889, 17889, 17888, 17888, 17893, 17893, 17893, 17891,
17888, 17892, 17892, 17893, 17891, 17888, 17889, 17891, 17889,
17890, 17891, 17888, 17889, 17888, 17890, 17893, 17889, 17889,
17893, 17889, 17892, 17891, 17889, 17892, 17888, 17891, 17893,
17890, 17890, 17889, 17893, 17889, 17889, 17888, 17889, 17892
), class = "Date"), count = c(7L, 9L, 8L, 3L, 5L, 4L, 5L,
10L, 1L, 3L, 5L, 18L, 3L, 7L, 1L, 17L, 277L, 10L, 14L, 50L,
520L, 92L, 791L, 6L, 7L, 4L, 2L, 1L, 3L, 3L, 145L, 17L, 10L,
42L, 1L, 1L, 1L, 2L, 7L, 627L, 3L, 6L, 4L, 3L, 3L, 2L, 1L,
2L, 1L, 20L, 41L, 4L, 283L, 1L, 14L, 5L, 2L, 1L, 3L, 3L,
7L, 12L, 36L, 9L, 14L, 1L, 6L, 13L, 1L, 14L, 12L, 16L, 3L,
2L, 6L, 7L, 4L, 21L, 3L, 5L, 5L, 22L, 12L, 5L, 1L, 5L, 23L,
36L, 13L, 12L, 12L, 9L, 4L, 6L, 6L, 4L, 1L, 4L, 1L, 32L)), row.names = c(NA,
100L), class = "data.frame")
我需要它看起来像这样(按部门分组(行)和每天每种类型的各自计数(列)):
目前,我有两种方法可以解决这个问题,但是都无法达到预期的效果,但是我怀疑我已经接近了,因为解决方案似乎介于两者之间。
第一种方法:
library(dplyr) # For the purpose of this reproducible example should you need it
dept %>%
group_by(
department
) %>%
summarise(
missed = sum(type == "Missed"),
completed = sum(type == "Completed"),
missed_pct = missed / (missed + completed)
)
哪个给我这个:
# A tibble: 7 x 4
department missed completed missed_pct
<fct> <int> <int> <dbl>
1 Beauty 2 5 0.286
2 Food 0 1 0
3 Menswear 4 6 0.4
4 Wine 1 1 0.5
5 Women's Accessories 2 5 0.286
6 Women's shoes 3 5 0.375
7 Womenswear 4 5 0.444
第二种方法:
library(dplyr) # For the purpose of this reproducible example should you need it
dept %>%
group_by(
department,
date
) %>%
summarise(
missed = sum(type == "Missed"),
completed = sum(type == "Completed"),
missed_pct = missed / (missed + completed)
)
哪个给我这个:
# A tibble: 28 x 5
# Groups: department [?]
department date missed completed missed_pct
<fct> <date> <int> <int> <dbl>
1 Beauty 2018-12-23 0 1 0
2 Beauty 2018-12-24 0 1 0
3 Beauty 2018-12-26 0 1 0
4 Beauty 2018-12-27 1 1 0.5
5 Beauty 2018-12-28 1 1 0.5
6 Food 2018-12-27 0 1 0
7 Menswear 2018-12-23 1 1 0.5
8 Menswear 2018-12-24 1 1 0.5
9 Menswear 2018-12-25 0 1 0
10 Menswear 2018-12-26 1 1 0.5
我该怎么做?
答案 0 :(得分:1)
代替分组,您需要将数据的格式从长格式更改为宽格式。这称为“投射”。
library(reshape2)
dcast(dept, department + type ~ date, fun.aggregate = sum)
哪个给:
department type 2018-12-23 2018-12-24 2018-12-25 2018-12-26 2018-12-27 2018-12-28
1 Beauty Completed 0 12 0 0 17 6
2 Beauty Missed 0 0 0 0 0 3
3 Boutique advisor Completed 0 0 0 0 1 4
4 Boutique advisors Completed 1 4 0 0 1 7
5 Boutique Stylist Completed 13 5 0 3 5 0
6 Boutique Stylist Missed 2 6 0 6 5 8
您的图像还显示了%行。你需要这个吗?
编辑:要添加百分比行,请在重塑之前对其进行计算:
dept %>%
# create the percentage rows by grouping by department/date/type. Later we will combine these rows back with the original data
group_by(department, date, type) %>%
# add a column n with the sum of count in each group
summarise(n=sum(count)) %>%
# do 2 separate things:
# - add a percent column
# - change all the values in the type column to have a % at the end so they don't get mixed up with the original values later
mutate(percent = n * 100 / sum(n), type = paste(type, "%")) %>%
# remove all rows except the percent ones
filter(type == "Missed %") %>%
# remove the temporary 'n' column we created earlier, and rename the percent column to 'count' so it can go through the 'dcast' function later without any problems
select(department, type, count = percent, "date") %>%
# append with the original data
bind_rows(dept) %>%
# cast the data with the date column used as columns
# and fill it with the sum of the 'count' column
# the percentage rows we created earlier will pass through the function unharmed as there is only one of them in each department/type/date
dcast(department + type ~ date, fun.aggregate = sum, value.var = "count")
哪个给:
department type 2018-12-23 2018-12-24 2018-12-25 2018-12-26 2018-12-27 2018-12-28
1 Beauty Completed 0 12 0 0 17 6.00000
2 Beauty Missed 0 0 0 0 0 3.00000
3 Beauty Missed % 0 0 0 0 0 33.33333
4 Boutique advisor Completed 0 0 0 0 1 4.00000
5 Boutique advisors Completed 1 4 0 0 1 7.00000
6 Boutique Stylist Completed 13 5 0 3 5 0.00000
答案 1 :(得分:1)
使用dplyr
:
library(dplyr)
dept %>%
group_by(department, date, type) %>%
summarise(count = sum(count, na.rm = T)) %>% # data had a few duplicate rows
spread(type, count, fill = 0) %>%
as.data.frame() %>%
group_by(department, date) %>%
mutate(missed_pct = Missed / (Missed + Completed)) %>%
melt(id.vars = c("department", "date")) %>%
spread(date, value) %>%
rename(type = variable)
让我知道您是否希望解释其中的任何内容。本质上,只是多次使用散布和合并来确保数据在每个阶段都按我们想要的结构进行构建(我建议您进行每个合并/散布,并注意data.frame的形状。)