我有一个如下所示的数据框:
# Example data: one row per municipality, with its geocode, display name,
# the region it belongs to, and its (integer) population.
dat <- data.frame(
  Geocode = c(
    "1100015", "1100023", "1100031", "1100049", "1100056",
    "1100064", "1100072", "1100080", "1100098", "1100106",
    "1100114", "1100122", "1100130", "1100148", "1100155",
    "1100189", "1100205", "1100254", "1100262", "1100288",
    "1100296", "1100304", "1100320", "1100338", "1100346",
    "1100379", "1100403", "1100452", "1100502", "1100601"
  ),
  Location = c(
    "Alta Floresta D'oeste, RO",
    "Ariquemes, RO",
    "Cabixi, RO",
    "Cacoal, RO",
    "Cerejeiras, RO",
    "Colorado Do Oeste, RO",
    "Corumbiara, RO",
    "Costa Marques, RO",
    "Espigo D'oeste, RO",
    "Guajar-Mirim, RO",
    "Jaru, RO",
    "Ji-Paran, RO",
    "Machadinho D'oeste, RO",
    "Nova Brasilndia D'oeste, RO",
    "Ouro Preto Do Oeste, RO",
    "Pimenta Bueno, RO",
    "Porto Velho, RO",
    "Presidente Mdici, RO",
    "Rio Crespo, RO",
    "Rolim De Moura, RO",
    "Santa Luzia D'oeste, RO",
    "Vilhena, RO",
    "So Miguel Do Guapor, RO",
    "Nova Mamor, RO",
    "Alvorada D'oeste, RO",
    "Alto Alegre Dos Parecis, RO",
    "Alto Paraso, RO",
    "Buritis, RO",
    "Novo Horizonte Do Oeste, RO",
    "Cacaulandia, RO"
  ),
  # Rows 1-10 are "Norte", rows 11-20 "Sul", rows 21-30 "Nordeste".
  Region = rep(c("Norte", "Sul", "Nordeste"), each = 10L),
  Population = c(
    25578L, 104401L, 6355L, 87226L, 17986L,
    18817L, 8842L, 16651L, 32385L, 46632L,
    55738L, 130419L, 37167L, 21592L, 39924L,
    37512L, 502748L, 22557L, 3750L, 56242L,
    8532L, 91801L, 23933L, 27600L, 17063L,
    13940L, 20210L, 37838L, 10276L, 6367L
  ),
  # Keep the text columns as character vectors on every R version.
  stringsAsFactors = FALSE
)
它显示了一些城市的人口以及城市所属的地区。
我需要按区间(breaks=c(0,50000,100000))对人口进行分组,
然后统计每个区间内的城市数量,既要有整体(所有地区)的结果,也要有按地区分别统计的结果。
结果数据框应如下所示(其中的数值是随意假设的示例值):
Class Region Count
[0-50000] Norte 7
[50000-100000] Norte 3
[>100000] Norte 0
[0-50000] Sul 5
[50000-100000] Sul 4
[>100000] Sul 1
[0-50000] Nordeste 4
[50000-100000] Nordeste 5
[>100000] Nordeste 1
[0-50000] All 16
[50000-100000] All 12
[>100000] All 2
非常感谢任何帮助。
答案 0 :(得分:4)
使用cut
和dplyr
library(dplyr)

# Bin Population into the three size classes. cut() intervals are
# right-closed by default, so the bins are (0, 50000], (50000, 100000] and
# (100000, Inf).
dat$Class <- cut(
  dat$Population,
  breaks = c(0, 50000, 100000, Inf),
  labels = c("0-50000", "50000-100000", ">100000")
)

# Number of cities per class within each region. Region is turned into a
# factor so that .drop = FALSE also keeps class/region combinations that have
# no cities (count 0), which the requested result includes. It is converted
# back to character so it can be stacked with the "All" rows below.
d1 <- dat %>%
  mutate(Region = factor(Region)) %>%
  count(Class, Region, name = "count", .drop = FALSE) %>%
  mutate(Region = as.character(Region))

# Number of cities per class over all regions, labelled "All".
d2 <- dat %>%
  count(Class, name = "count", .drop = FALSE) %>%
  mutate(Region = "All")

# Per-region rows first, then the overall rows; columns are matched by name.
bind_rows(d1, d2)
Class Region count
<fct> <chr> <int>
1 0-50000 Nordeste 9
2 0-50000 Norte 8
3 0-50000 Sul 6
4 50000-100000 Nordeste 1
5 50000-100000 Norte 1
6 50000-100000 Sul 2
7 >100000 Nordeste 0
8 >100000 Norte 1
9 >100000 Sul 2
10 0-50000 All 23
11 50000-100000 All 4
12 >100000 All 3
答案 1 :(得分:1)
这是一个简单粗暴(quick and dirty)的方法,之后可能会更新,使其更简洁并避免使用bind_rows()
尝试以下方法:
library(tidyverse)

# Classify every city once and reuse the result for both summaries.
# case_when() takes the first matching condition, so each test only needs the
# upper bound of its class: (.., 50000], (50000, 100000], (100000, ..).
# The labels become a factor with explicit levels so that the output is
# ordered by population size rather than alphabetically, and Region becomes a
# factor so that empty class/region combinations can be kept below.
dat_classed <- dat %>%
  mutate(
    population_breaks = factor(
      case_when(
        Population <= 50000 ~ "0-50000",
        Population <= 100000 ~ "50000-100000",
        Population > 100000 ~ ">100000"
      ),
      levels = c("0-50000", "50000-100000", ">100000")
    ),
    Region = factor(Region)
  )

# Cities per class within each region; .drop = FALSE keeps combinations with
# no cities as rows with n = 0. Region goes back to character so it can be
# stacked with the "All" rows.
dat_1 <- dat_classed %>%
  count(population_breaks, Region, .drop = FALSE) %>%
  mutate(Region = as.character(Region))

# Cities per class over all regions, labelled "All".
dat_2 <- dat_classed %>%
  count(population_breaks, .drop = FALSE) %>%
  mutate(Region = "All")

# Per-region rows first, then the overall rows; columns are matched by name.
bind_rows(dat_1, dat_2)
返回:
# A tibble: 12 x 3
population_breaks Region n
<fct> <chr> <int>
1 0-50000 Nordeste 9
2 0-50000 Norte 8
3 0-50000 Sul 6
4 50000-100000 Nordeste 1
5 50000-100000 Norte 1
6 50000-100000 Sul 2
7 >100000 Nordeste 0
8 >100000 Norte 1
9 >100000 Sul 2
10 0-50000 All 23
11 50000-100000 All 4
12 >100000 All 3