Question

我有一个如下所示的数据框：

# Example data: a 30-row data.frame with one row per city.
#   Geocode    - character code identifying the row
#   Location   - character city name, each ending in ", RO"
#   Region     - character; "Norte", "Sul" or "Nordeste" (10 rows each)
#   Population - integer population of the city
dat <- structure(list(Geocode = c("1100015", "1100023", "1100031", "1100049", 
"1100056", "1100064", "1100072", "1100080", "1100098", "1100106", 
"1100114", "1100122", "1100130", "1100148", "1100155", "1100189", 
"1100205", "1100254", "1100262", "1100288", "1100296", "1100304", 
"1100320", "1100338", "1100346", "1100379", "1100403", "1100452", 
"1100502", "1100601"), Location = c("Alta Floresta D'oeste, RO", 
"Ariquemes, RO", "Cabixi, RO", "Cacoal, RO", "Cerejeiras, RO", 
"Colorado Do Oeste, RO", "Corumbiara, RO", "Costa Marques, RO", 
"Espigo D'oeste, RO", "Guajar-Mirim, RO", "Jaru, RO", "Ji-Paran, RO", 
"Machadinho D'oeste, RO", "Nova Brasilndia D'oeste, RO", "Ouro Preto Do Oeste, RO", 
"Pimenta Bueno, RO", "Porto Velho, RO", "Presidente Mdici, RO", 
"Rio Crespo, RO", "Rolim De Moura, RO", "Santa Luzia D'oeste, RO", 
"Vilhena, RO", "So Miguel Do Guapor, RO", "Nova Mamor, RO", "Alvorada D'oeste, RO", 
"Alto Alegre Dos Parecis, RO", "Alto Paraso, RO", "Buritis, RO", 
"Novo Horizonte Do Oeste, RO", "Cacaulandia, RO"), Region = c("Norte", 
"Norte", "Norte", "Norte", "Norte", "Norte", "Norte", "Norte", 
"Norte", "Norte", "Sul", "Sul", "Sul", "Sul", "Sul", 
"Sul", "Sul", "Sul", "Sul", "Sul", "Nordeste", "Nordeste", 
"Nordeste", "Nordeste", "Nordeste", "Nordeste", "Nordeste", "Nordeste", "Nordeste", 
"Nordeste"), Population = c(25578L, 104401L, 6355L, 87226L, 17986L, 
18817L, 8842L, 16651L, 32385L, 46632L, 55738L, 130419L, 37167L, 
21592L, 39924L, 37512L, 502748L, 22557L, 3750L, 56242L, 8532L, 
91801L, 23933L, 27600L, 17063L, 13940L, 20210L, 37838L, 10276L, 
6367L)), .Names = c("Geocode", "Location", "Region", "Population"
), row.names = c(NA, 30L), class = "data.frame")

它显示了一些城市的人口以及城市所属的地区。

我需要按区间（breaks=c(0,50000,100000)）对人口进行分组，然后统计每个区间内的城市数量，既包括整体（所有地区），也包括按地区分别统计。

结果数据框应如下所示（随机，假设值）：

Class                  Region       Count
[0-50000]               Norte        7
[50000-100000]          Norte        3
[>100000]               Norte        0
[0-50000]               Sul          5
[50000-100000]          Sul          4
[>100000]               Sul          1
[0-50000]               Nordeste     4
[50000-100000]          Nordeste     5
[>100000]               Nordeste     1
[0-50000]               All          16
[50000-100000]          All          12
[>100000]               All          2

非常感谢任何帮助。

Answer 1

使用cut和dplyr

library(dplyr)

# Bin Population into three right-closed classes:
# (0, 50000], (50000, 100000] and (100000, Inf].
dat$Class <- cut(
  dat$Population,
  breaks = c(0, 50000, 100000, Inf),
  labels = c("0-50000", "50000-100000", ">100000")
)

# Number of cities in each class within each region.
by_region <- dat %>%
  group_by(Class, Region) %>%
  summarise(count = n())

# Number of cities in each class over all regions, tagged Region = "All".
overall <- dat %>%
  group_by(Class) %>%
  summarise(count = n(), Region = "All")

# Stack the two summaries; bind_rows() matches columns by name.
bind_rows(by_region, overall)

          Class   Region count
         <fctr>    <chr> <int>
 1      0-50000 Nordeste     9
 2      0-50000    Norte     8
 3      0-50000      Sul     6
 4 50000-100000 Nordeste     1
 5 50000-100000    Norte     1
 6 50000-100000      Sul     2
 7      >100000    Norte     1
 8      >100000      Sul     2
 9      0-50000      All    23
10 50000-100000      All     4
11      >100000      All     3

Answer 2

这是一个快速但比较粗糙的方法，我之后可能会更新它，使其更简洁并避免使用 bind_rows()。

尝试以下方法：

library(tidyverse)

# Classify every city exactly once so the per-region and overall counts are
# guaranteed to share the same bins. case_when() uses the first condition
# that matches, so each branch only needs its upper bound:
# <= 50000, then <= 100000, then everything above 100000.
dat_classed <- dat %>%
  mutate(
    population_breaks = case_when(
      Population <= 50000 ~ "0-50000",
      Population <= 100000 ~ "50000-100000",
      Population > 100000 ~ ">100000"
    )
  )

# Number of cities per population class within each region.
dat_1 <- dat_classed %>%
  group_by(population_breaks) %>%
  count(Region)

# Number of cities per population class over all regions, labelled "All".
# count() with no arguments tallies the existing groups.
dat_2 <- dat_classed %>%
  group_by(population_breaks) %>%
  count() %>%
  mutate(Region = "All")

# Stack both summaries; bind_rows() matches columns by name.
bind_rows(dat_1, dat_2)

返回：

# A tibble: 11 x 3
# Groups:   population_breaks [3]
   population_breaks   Region     n
               <chr>    <chr> <int>
 1           0-50000 Nordeste     9
 2      50000-100000 Nordeste     1
 3           >100000    Norte     1
 4           0-50000    Norte     8
 5      50000-100000    Norte     1
 6           >100000      Sul     2
 7           0-50000      Sul     6
 8      50000-100000      Sul     2
 9           >100000      All     3
10           0-50000      All    23
11      50000-100000      All     4

R - 按区间切分并按组统计出现次数

2 个答案: