在 R 中使用 dplyr 按另一列分组,统计分类值的个数

时间:2015-08-12 04:34:39

标签: r aggregate dplyr

我想按位置名称汇总df。数据看起来像这样:

# Build a small example data set: each row gets a randomly drawn location and
# tree type plus uniformly distributed density and income values.
# Seed the RNG so that every run of the example produces the same rows.
set.seed(42)

location <- c(
  "NY", "NC", "KA", "TX", "AZ", "NC", "SC", "ND", "SD", "MN", "WA",
  "MA", "VT", "CA", "OR", "NJ", "OH", "MI", "IL", "GA", "FL"
)
tree_type <- c("pine", "birch", "maple", "palm")

# Number of observations in the example data frame.
n_obs <- 20L

df <- data.frame(
  location = sample(location, n_obs, replace = TRUE),
  tree_type = sample(tree_type, n_obs, replace = TRUE),
  density = runif(n_obs, min = 24, max = 365),
  income = runif(n_obs, min = 37000, max = 62000)
)

我想要的是:

   location mean(density) mean(income) birch maple palm pine
1        AZ      38.44009     52032.95     0     0    1    0
2        CA     136.85112     42243.35     0     1    0    0
3        GA     101.24081     53405.60     2     0    0    0
4        IL     172.02651     46368.42     1     1    0    0
5        MA     198.69868     51117.18     0     0    0    1
6        MI     153.93358     60425.87     1     0    0    0
7        MN     185.05276     46468.68     0     0    1    0
8        NC     181.42187     46007.93     1     0    2    0
9        NJ     302.66541     59316.94     0     0    2    0
10       OR     303.88283     48497.03     0     0    0    2
11       SC      84.05136     50348.41     0     1    0    1
12       SD     158.47423     57894.27     0     0    1    0
13       VT     126.32967     42853.04     0     0    1    0

我是这样做的:

# Load with library() so that a missing package stops the script right here;
# require() would only return FALSE and let a later call fail instead.
library(dplyr)
library(reshape2)

# Per-location means of the two numeric columns.
df_quantvars <- df %>%
  group_by(location) %>%
  summarise(mean(density), mean(income))

# Cross-tabulate location by tree type. The columns are selected by name so
# the result does not depend on their position in `df`. as.data.frame() turns
# the table into long format with the counts in a `Freq` column.
df_catvarslong <- as.data.frame(table(df[c("location", "tree_type")]))

# Reshape to one row per location and one count column per tree type.
df_catvarswide <- dcast(df_catvarslong, location ~ tree_type, value.var = "Freq")

# Attach the tree-type counts to the per-location means.
final_df <- left_join(df_quantvars, df_catvarswide, by = "location")

仅用 dplyr 的 group_by 惯用写法就无法做到这一点吗?冒着显得愚蠢的风险,我尝试了这样写:

# Attempt to produce the per-location means and the tree-type counts in one
# summarise() call. Note that table(df[1:2]) is built from the first two
# columns of the whole `df`, not from the rows of the current group.
# NOTE(review): the contingency table is presumably not a valid per-group
# summary value, which would explain why this does not give the desired
# result; confirm against the installed dplyr version.
df_quantvars <- df %>% group_by(location) %>% summarise(mean(density), mean(income), table(df[1:2]))

我错过了什么?

2 个答案:

答案 0 :(得分:2)

这个回答有点迟,但我找到了一种可行的办法。要把所有列都保留下来有点棘手,不过下面的做法似乎有效:

首先,我使用 group_by(location, tree_type) 统计每种树的数量,然后使用 group_by(location) 计算所需的均值。接着,我用 select(-c(density, income)) 删除原始的 density 和 income 列,此时会留下重复的行,但聚合后的计数是正确的。最后,我用 distinct() 删除重复行,再用 tidyr 包中的 spread() 转换成您要求的宽格式。

library(dplyr)
library(tidyr)

# Add a per-(location, tree_type) count and the per-location means to every
# row, drop the raw numeric columns, collapse the resulting duplicate rows,
# and widen so that each tree type becomes its own count column.
df %>%
  arrange(location) %>%
  # Number of rows for each location / tree-type combination.
  group_by(location, tree_type) %>%
  mutate(Count = n()) %>%
  # Means are taken over all rows of a location, regardless of tree type.
  group_by(location) %>%
  mutate(MeanDensity = mean(density), MeanIncome = mean(income)) %>%
  ungroup() %>%
  # With the raw values removed, rows of the same combination are identical.
  select(-density, -income) %>%
  distinct() %>%
  # Combinations that do not occur are filled with 0.
  spread(tree_type, Count, fill = 0)

这给了我:

  location MeanDensity MeanIncome birch maple  palm  pine
     (fctr)       (dbl)      (dbl) (dbl) (dbl) (dbl) (dbl)
1        AZ   244.18094   57474.94     0     0     1     0
2        FL    51.90693   42425.36     0     0     0     1
3        GA   341.18643   49385.44     0     0     0     2
4        IL   258.11124   37101.36     0     1     0     0
5        KA   267.92430   59699.20     1     0     0     0
6        MA    87.48623   60632.98     1     0     0     0
7        MI   197.18310   58837.00     0     0     0     1
8        NC   362.48531   50857.42     0     0     1     0
9        ND   315.57415   51465.06     0     0     1     0
10       NJ   233.72886   55877.40     0     0     1     1
11       NY   283.41522   49275.58     0     1     0     1
12       OH   350.23362   40901.73     0     0     1     0
13       OR   267.68415   38954.04     0     2     0     0
14       SC   260.12169   52837.10     0     1     0     0
15       SD    76.29782   54986.76     0     1     0     0
16       VT   341.80646   44547.77     1     0     0     0

答案 1 :(得分:0)

这可以分阶段进行。首先,计算各位置的均值。

# Per-location averages of density and income. The summaries are left unnamed,
# so the columns are called `mean(density)` and `mean(income)`.
x <- summarise(group_by(df, location), mean(density), mean(income))

location `mean(density)` `mean(income)`
   <fct>              <dbl>          <dbl>
 1 AZ                 150.          44667.
 2 FL                 262.          53719.
 3 IL                 308.          41077.
 4 KA                 183.          48432.
 5 MI                 192.          61649.
 6 NC                 210.          50838.
 7 NJ                 223.          49958.
 8 OH                 357.          57881.
 9 OR                 336.          52582.
10 SC                 234.          40414.
11 SD                  54.8         48203.

然后分别统计每种树类型的数量,再用 spread() 展开为宽格式。

# Count the rows of each location / tree-type combination, then widen so that
# every tree type becomes its own column of counts.
y <- spread(
  count(group_by(df, location, tree_type)),
  key = tree_type,
  value = n,
  fill = 0L # combinations that never occur become 0 instead of NA
)

# A tibble: 11 x 5
# Groups:   location [11]
   location birch maple  palm  pine
   <fct>    <int> <int> <int> <int>
 1 AZ           0     1     0     1
 2 FL           0     0     1     0
 3 IL           0     0     0     1
 4 KA           2     0     0     2
 5 MI           0     0     1     0
 6 NC           1     2     0     1
 7 NJ           1     0     1     0
 8 OH           0     0     1     0
 9 OR           1     0     0     0
10 SC           1     0     0     0
11 SD           1     0     1     0

最后,将两个数据框连接在一起。它们会根据共有的 location 列自动连接。我使用了 right_join,但您也可以使用 left_join。

# Combine the means with the counts. No key is given, so the join uses the
# column the two tables have in common.
x %>% right_join(y)

Joining, by = "location"
# A tibble: 11 x 7
   location `mean(density)` `mean(income)` birch maple  palm  pine
   <fct>              <dbl>          <dbl> <int> <int> <int> <int>
 1 AZ                 150.          44667.     0     1     0     1
 2 FL                 262.          53719.     0     0     1     0
 3 IL                 308.          41077.     0     0     0     1
 4 KA                 183.          48432.     2     0     0     2
 5 MI                 192.          61649.     0     0     1     0
 6 NC                 210.          50838.     1     2     0     1
 7 NJ                 223.          49958.     1     0     1     0
 8 OH                 357.          57881.     0     0     1     0
 9 OR                 336.          52582.     1     0     0     0
10 SC                 234.          40414.     1     0     0     0
11 SD                  54.8         48203.     1     0     1     0

三个阶段似乎很冗长。我们可以将所有内容组合成一个语句。

# Single-statement version of the three stages above.
# Stage one: per-location means of the numeric columns.
df %>%
  group_by(location) %>%
  summarise(mean(density), mean(income)) %>%
  # Stages two and three: build the wide table of tree-type counts and join it
  # onto the means.
  right_join(
    df %>%
      group_by(location, tree_type) %>%
      count() %>%
      spread(tree_type, n, fill = 0L),
    # Name the key explicitly so the join does not depend on whichever columns
    # the two tables happen to share, and does not emit a "Joining, by" message.
    by = "location"
  )