我有一个由成对城市(V1 和 V2)组成的数据集,每对城市的人口分别记录在 v1_pop2015 和 v2_pop2015 两列中。
我想创建一个新的数据集,其中只包含每对城市中人口较多的那座城市的 cityCode,以及两座城市的人口之和(即较大城市的人口加上较小城市的人口)。
我能够使用for循环创建所需的输出。出于教育目的,我尝试使用tidyverse工具来完成此任务。
这是一个有效的示例
library(tidyverse)
## Sample dataset: six city pairs, with one code column and one population
## column for each of the two cities (V1 and V2)
pairs_pop <- structure(
  list(
    cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
    cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
    v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
    v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)
  ),
  row.names = c(NA, 6L),
  class = c("tbl_df", "tbl", "data.frame")
)
pairs_pop
#> # A tibble: 6 x 4
#> cityCodeV1 cityCodeV2 v1_pop2015 v2_pop2015
#> * <dbl> <dbl> <dbl> <dbl>
#> 1 20073 20063 414 384
#> 2 20888 204024 682 757
#> 3 20222 20183 497 5716
#> 4 22974 20406 3639 315
#> 5 23792 23586 384 367
#> 6 20779 23595 596 1303
#### This is working !!!
# Build `clean_df` with one row per city pair:
#   to_keep     - cityCode of the more populous city of the pair
#   to_keep_pop - combined population of both cities of the pair
n_pairs <- nrow(pairs_pop)
clean_df <- data.frame(
  to_keep = rep(NA_real_, n_pairs),
  to_keep_pop = rep(NA_real_, n_pairs)
)
# seq_len() yields an empty sequence for a zero-row table, whereas 1:0 would
# iterate over c(1, 0) and fail on the first comparison.
for (i in seq_len(n_pairs)) {
  # Strict `>`: when both populations are equal, the V2 code is kept.
  if (pairs_pop$v1_pop2015[i] > pairs_pop$v2_pop2015[i]) {
    clean_df$to_keep[i] <- pairs_pop$cityCodeV1[i]
  } else {
    clean_df$to_keep[i] <- pairs_pop$cityCodeV2[i]
  }
  # The combined population does not depend on which city is kept.
  clean_df$to_keep_pop[i] <- pairs_pop$v1_pop2015[i] + pairs_pop$v2_pop2015[i]
}
clean_df
#> to_keep to_keep_pop
#> 1 20073 798
#> 2 204024 1439
#> 3 20183 6213
#> 4 22974 3954
#> 5 23792 751
#> 6 23595 1899
这就是我被困住的地方
### trying to tidy it with rowwise, mutate and a function
# Predicate: is the V1 population strictly greater than the V2 population?
# `x` is anything whose `v1_pop2015` and `v2_pop2015` fields can be read with
# `$`; the `if` below expects their comparison to be a single non-missing value.
# Returns TRUE or FALSE, and echoes `x` to the console as a side effect.
v1_sup_tov2 <- function(x) {
  print(x)
  v1_is_larger <- x$v1_pop2015 > x$v2_pop2015
  if (v1_is_larger) TRUE else FALSE
}
# Non-working attempt (the point where the question gets stuck): tries to use
# `v1_sup_tov2` as a per-row condition for creating the new columns.
# NOTE(review): dplyr's mutate_if() appears to use its predicate to choose
# which *columns* to transform, not which rows -- confirm against the dplyr
# documentation; the named arguments below are not column definitions there.
to_clean_df2 <- pairs_pop %>%
rowwise() %>%
mutate_if(v1_sup_tov2,
to_keep = cityCodeV1,
to_delete= cityCodeV2,
to_keep_pop = v1_pop2015 + v2_pop2015)
预期输出是一个包含 2 列的数据框,如下所示:to_keep:要保留的城市(即每对城市中人口较多者)的 cityCode;to_keep_pop:合并后的人口,即两座城市的人口之和。
clean_df
#> to_keep to_keep_pop
#> 1 20073 798
#> 2 204024 1439
#> 3 20183 6213
#> 4 22974 3954
#> 5 23792 751
#> 6 23595 1899
答案 0 :(得分:3)
这样如何?
library(dplyr)
## Sample dataset, assembled in two steps: the named column list first, then
## the attributes that mark it as a 6-row, tibble-classed data frame
pairs_pop <- list(
  cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
  cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
  v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
  v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)
)
attr(pairs_pop, "row.names") <- c(NA, 6L)
class(pairs_pop) <- c("tbl_df", "tbl", "data.frame")
# For each pair, keep the code of the more populous city (the V2 code when the
# two populations are equal) together with the pair's combined population.
clean_df <- pairs_pop %>%
  transmute(
    to_keep = if_else(v2_pop2015 < v1_pop2015, cityCodeV1, cityCodeV2),
    to_keep_pop = v2_pop2015 + v1_pop2015
  )
答案 1 :(得分:3)
如果将来每一行包含两个以上的城市(v1、v2、v3……),可以采用下面的做法。不要忘记把所有信息都保留在数据框中,这样才能知道每个值对应的是什么,也就是把数据整理成整洁(tidy)的数据框。
library(dplyr)
## Sample dataset: one row per city pair, with a code column and a population
## column for each of the two cities
pairs_pop <- structure(
  list(
    cityCodeV1 = c(20073, 20888, 20222, 22974, 23792, 20779),
    cityCodeV2 = c(20063, 204024, 20183, 20406, 23586, 23595),
    v1_pop2015 = c(414, 682, 497, 3639, 384, 596),
    v2_pop2015 = c(384, 757, 5716, 315, 367, 1303)
  ),
  row.names = c(NA, 6L),
  class = c("tbl_df", "tbl", "data.frame")
)
# Tidy dataset with all information that was in columns
library(dplyr)
library(tidyr)
library(stringr)
# Reshape to one row per (pair, city within the pair), with the columns
# `city` (index of the pair), `ville` (index of the city inside the pair, taken
# from the original column name), `cityCode` and `pop`.
tidy_pairs <- pairs_pop %>%
  # row_number() also copes with an empty table, where 1:n() would give c(1, 0).
  mutate(city = row_number()) %>%
  pivot_longer(cols = -city, names_to = "key", values_to = "value") %>%
  mutate(
    # The first run of digits in the column name identifies the city within
    # the pair ("cityCodeV1" -> "1", "v2_pop2015" -> "2"); `+` keeps
    # multi-digit indices such as "V10" intact.
    ville = str_extract(key, "[[:digit:]]+"),
    # Collapse the original column names into the kind of value they hold.
    key = case_when(
      grepl("cityCode", key) ~ "cityCode",
      grepl("pop", key) ~ "pop",
      TRUE ~ "other"
    )
  ) %>%
  pivot_wider(names_from = key, values_from = value)
然后就可以在整洁的数据上按每对城市进行所需的比较和汇总:
# One row per city pair (`city` is the pair index): the code of its most
# populous city and the pair's total population.
tidy_pairs %>%
  group_by(city) %>%
  summarise(
    # last() guarantees a single value per group: if several cities tie for the
    # maximum, the last one in row order is kept instead of returning them all.
    to_keep = last(cityCode[pop == max(pop)]),
    to_keep_pop = sum(pop)
  )