我有这个df
:
> df <- data.frame(Adults = sample(0:5, 10, replace = TRUE),
+ Children = sample(0:2, 10, replace = TRUE),
+ Teens = sample(1:3, 10, replace = TRUE),
+ stringsAsFactors = FALSE)
> df
Adults Children Teens
1 5 0 1
2 5 1 2
3 5 2 3
4 5 2 2
5 0 1 2
6 5 1 3
7 0 2 3
8 4 2 1
9 4 0 1
10 1 2 1
我们可以看到Children
没有3,4,5
值,而Teens
没有0,4,5
值。但是,我们知道Adults
,Children
和Teens
可能来自0 to 5
。
当我将group_by()
与summarise()
一起使用时,summarise
会删除我没有分组的列。代码:
df %>%
group_by(Adults) %>% mutate(n_Adults = n()) %>%
group_by(Teens) %>% mutate(n_Teens = n()) %>%
group_by(Children) %>% mutate(n_Children = n())
当我按c(0,1,2,3,4,5)
分组(以获取所有可能的值)时,出现此错误:
Error in mutate_impl(.data, dots) : Column `c(0, 1, 2, 3, 4, 5)` must be length 10 (the number of rows) or one, not 6
我正在寻找以下输出:
Values n_Adults n_Children n_Teens p_Adults p_Children p_Teens
0 2 2 0 0.2 0.2 0
1 1 3 4 0.1 0.1 0.4
2 0 5 3 0 0 0.3
3 0 0 3 0 0 0.3
4 2 0 0 0.2 0.2 0
5 5 0 0 0.5 0.5 0
其中n_
是相应列的计数,p_
是相应列的百分比。
答案 0 :(得分:2)
我们可以将数据gather
转换为“长”格式,在将“值”转换为count
并将factor
指定为0:5的情况下,使用levels
获得频率。 ,将spread
设置为“宽”格式,并通过除以每列的sum
来创建“ p”列,并在需要时更改列名称(使用rename_at
)
library(tidyverse)
gather(df) %>%
count(key, value = factor(value, levels = 0:5)) %>%
spread(key, n, fill = 0) %>%
mutate_at(2:4, list(p = ~./sum(.)))%>%
rename_at(2:4, ~ paste0(.x, "_n"))
df <- structure(list(Adults = c(1L, 1L, 4L, 3L, 3L, 5L, 1L, 4L, 4L,
1L), Children = c(1L, 1L, 2L, 2L, 0L, 2L, 0L, 0L, 1L, 0L), Teens = c(1L,
2L, 3L, 1L, 1L, 3L, 1L, 2L, 2L, 1L)), class = "data.frame", row.names = c(NA,
-10L))
答案 1 :(得分:0)
library(reprex)
library(tidyverse)
set.seed(20)
df <- data.frame(Adults = sample(0:5, 10, replace = TRUE),
Children = sample(0:2, 10, replace = TRUE),
Teens = sample(1:3, 10, replace = TRUE),
stringsAsFactors = FALSE)
df
#> Adults Children Teens
#> 1 5 2 2
#> 2 4 2 1
#> 3 1 0 2
#> 4 3 2 1
#> 5 5 0 1
#> 6 5 1 1
#> 7 0 0 3
#> 8 0 0 3
#> 9 1 0 1
#> 10 2 2 3
df_adults <- df %>%
count(Adults) %>%
rename( n_Adults = n)
df_childred <- df %>%
count(Children) %>%
rename( n_Children = n)
df_teens <- df %>%
count(Teens) %>%
rename( n_Teens = n)
df_new <- data.frame(unique_id = 0:5)
df_new <- left_join(df_new,df_adults, by = c("unique_id"="Adults"))
df_new <- left_join(df_new,df_childred, by = c("unique_id"="Children"))
df_new <- left_join(df_new,df_teens, by = c("unique_id"="Teens"))
df_new <- df_new %>%
replace_na(list( n_Adults=0, n_Children=0, n_Teens=0))
df_new %>%
mutate(p_Adults = n_Adults/sum(n_Adults),p_Children = n_Children/sum(n_Children), p_Teens = n_Teens/sum(n_Teens))
#> unique_id n_Adults n_Children n_Teens p_Adults p_Children p_Teens
#> 1 0 2 5 0 0.2 0.5 0.0
#> 2 1 2 1 5 0.2 0.1 0.5
#> 3 2 1 4 2 0.1 0.4 0.2
#> 4 3 1 0 3 0.1 0.0 0.3
#> 5 4 1 0 0 0.1 0.0 0.0
#> 6 5 3 0 0 0.3 0.0 0.0
由reprex package(v0.2.1)于2019-02-25创建