获取R中列的频率

时间:2015-06-29 21:05:14

标签: r

我在这种格式的数据框中有数据:

  grp1 grp2 grp3 grp4 result
1    0    1    0    0      1
2    1    0    0    0      0
3    0    0    0    1      1
4    0    0    0    1      1
5    1    0    0    0      0
6    0    1    0    0      1
.
.
.

可以使用以下代码生成:

set.seed(13)

groups <- c("grp1", "grp2", "grp3", "grp4", "result")

# Build 50 observations. Each one is a shuffled one-hot group membership
# (exactly one of the four group flags is 1) followed by a random 0/1 result.
# The membership is drawn before the result so the random stream is consumed
# in a fixed order.
x <- do.call(
  rbind,
  lapply(seq_len(50), function(i) {
    membership <- sample(c(1, 0, 0, 0), 4)
    outcome <- sample(0:1, 1)
    c(membership, outcome)
  })
)
df <- setNames(data.frame(x), groups)

我的目标是将数据格式化为:

  group      freq
1  grp1 0.2857143
2  grp2 0.6250000
3  grp3 0.5000000
4  grp4 0.5625000

其中,频率是每个组中有结果(result为1)的行所占的百分比。

到目前为止,我尝试使用dplyr:

library(dplyr)

# Attempt from the question: count the rows for every combination of the four
# group flags and result, turn the counts into shares, and keep only the
# result == 1 rows. The output still carries the four one-hot flag columns
# instead of a single group label.
df %>% 
  group_by(grp1, grp2, grp3, grp4, result) %>% 
  # n() is the number of rows in each flag/result combination.
  # NOTE(review): summarize() is expected to drop the last grouping variable
  # (result), so sum(n) in the next step is taken within each grp1..grp4
  # combination rather than over the whole table - confirm against the
  # installed dplyr version.
  summarize(n = n()) %>% 
  mutate(freq = n / sum(n)) %>%
  select(-n) %>%
  filter(result == 1)

结果

  grp1 grp2 grp3 grp4 result      freq
1    0    0    0    1      1 0.5625000
2    0    0    1    0      1 0.5000000
3    0    1    0    0      1 0.6250000
4    1    0    0    0      1 0.2857143

5 个答案:

答案 0 :(得分:6)

这是data.table尝试

library(data.table)

# Reshape to long format (one row per observation/group pair, keyed by
# result), then for each group divide the membership count among result == 1
# rows by the total membership count.
# Note: setDT() converts df to a data.table in place.
melt(setDT(df), id.vars = "result")[
  , list(freq = sum(value[result == 1]) / sum(value)), by = "variable"
]
#    variable      freq
# 1:     grp1 0.2857143
# 2:     grp2 0.6250000
# 3:     grp3 0.5000000
# 4:     grp4 0.5625000

答案 1 :(得分:4)

对于“频率是每个组中有结果的百分比”,我假设您指的是每个组中result等于1的行所占的百分比。

# Reshape to one row per (observation, group) pair, keep only the pairs where
# the observation actually belongs to the group, and report the share of
# those rows whose result is 1.
df %>%
  tidyr::gather(key = group, value = group_choice, grp1:grp4) %>%
  filter(group_choice == 1) %>%
  group_by(group) %>%
  summarize(freq = mean(result == 1))

# Source: local data frame [4 x 2]
# 
#   group      freq
# 1  grp1 0.2857143
# 2  grp2 0.6250000
# 3  grp3 0.5000000
# 4  grp4 0.5625000

答案 2 :(得分:3)

您还可以使用apply

# Share of rows with result == 1 among the rows that belong to each group.
# apply() walks over every column, so the entry computed for the result
# column itself (always 1) is dropped afterwards; only the group columns are
# meaningful here.
freq <- apply(df, 2, function(x) sum(x == 1 & df$result == 1) / sum(x))
freq <- freq[names(freq) != "result"]
data.frame(freq)
#           freq
# grp1 0.2857143
# grp2 0.6250000
# grp3 0.5000000
# grp4 0.5625000

正如@akrun所建议的,你也可以这样做:

# Apply the same ratio to the columns of df in a single dplyr call: for each
# column, the count of rows where both the column and result equal 1, divided
# by the column sum. t() then transposes the one-row summary so that each
# column becomes a row.
# NOTE(review): summarise_each() and funs() are reported as deprecated in
# newer dplyr releases in favour of across() - confirm against the installed
# version before reusing this.
summarise_each(df,funs( sum(.==1 & df$result==1)/sum(.))) %>% t()

在这种情况下,apply似乎提供了最快的解决方案:

# dplyr variant: per-column ratio of rows where both the column and result
# equal 1 to the column sum, returned transposed (one row per column).
akrun <- function(df) {
  ratios <- summarise_each(df, funs(sum(. == 1 & df$result == 1) / sum(.)))
  t(ratios)
}
# Base R variant: for every column of df (result included), the number of
# rows where both the column and result equal 1, divided by the column sum.
# Returns a named numeric vector with one entry per column.
user7598 <- function(df) {
  has_result <- df$result == 1
  apply(df, 2, function(column) sum(column == 1 & has_result) / sum(column))
}
# data.table variant: melt to long format keyed by result, then per group
# divide the membership count among result == 1 rows by the total membership
# count. Returns a data.table with columns `variable` and `freq`.
David <- function(df) {
  # as.data.table() works on a copy. setDT() converts its argument by
  # reference, which would silently turn the caller's data frame into a
  # data.table and change the input seen by every later call.
  long <- melt(as.data.table(df), id.vars = "result")
  long[, .(freq = sum(value[result == 1]) / sum(value)), by = variable]
}
# tidyr/dplyr variant: reshape to one row per (observation, group) pair, keep
# the pairs where the observation belongs to the group, and return the share
# of those rows whose result is 1, one row per group.
Gregor <- function(df) {
  df %>%
    tidyr::gather(key = group, value = group_choice, grp1:grp4) %>%
    filter(group_choice == 1) %>%
    group_by(group) %>%
    summarize(freq = mean(result == 1))
}

# SPEED TESTS
# microbenchmark() comes from the microbenchmark package, which must be
# attached before the call below can be resolved.
library(microbenchmark)
set.seed(5)
microbenchmark(akrun(df), Gregor(df), user7598(df), David(df))
Unit: microseconds
         expr       min         lq       mean    median         uq       max neval cld
    akrun(df)  9645.860 10509.3940 12690.5538 10848.248 12315.4020 98239.948   100   c
   Gregor(df) 10319.888 11405.6060 12512.9027 11685.120 12237.1120 26211.999   100   c
 user7598(df)   423.662   491.7045   630.8143   563.958   629.8315  2027.243   100   a  
    David(df)  2115.610  2273.5525  2622.7699  2348.005  2475.2295 15491.534   100   b 

注意:已根据OP在@Gregor答案下的评论做了相应修改。

答案 3 :(得分:0)

如果我理解正确,你想知道在“result”等于1的条件下,每个组取值为“1”的行所占的百分比。如果是这样,可以用apply()函数对各列求和,再除以列的长度。在对数据框取子集时指定行条件,即可施加“result”必须等于1的约束。

请注意,在下面的数据框取子集操作中,我让R只对前四列使用apply,因为“result”列不需要参与频率计算。

即: df[条件语句, c(1:4)]

# Among the rows with result == 1, the share that falls into each of the
# first four (group) columns: column sums divided by the number of such rows.
with_result <- df[df$result == 1, c(1:4)]
result <- data.frame(colSums(with_result) / nrow(with_result))
colnames(result) <- "freq"

这会产生以下格式的结果

      freq
grp1 0.1818182
grp2 0.1818182
grp3 0.3636364
grp4 0.2727273

答案 4 :(得分:0)

我认为colSums()适用于此:

# Position of the result column; every other column is treated as a group.
rci <- which(names(df) == "result")

# For each group column: rows where both the group flag and result equal 1,
# divided by the number of rows in the group.
data.frame(
  group = names(df[-rci]),
  freq = unname(
    colSums(df[-rci] == 1 & df[, rci] == 1) / colSums(df[-rci])
  )
)
##   group      freq
## 1  grp1 0.2857143
## 2  grp2 0.6250000
## 3  grp3 0.5000000
## 4  grp4 0.5625000